/**
  @file
  @brief Main Converse header file.  Everything in Converse is 
  either declared here directly, or else included from here.

  Routine names in Converse include a short prefix starting with "C".
  These generally tell you what category the routine is in:
  - Cmi, Converse Machine Interface, the machine layer. (machine.C)
     Also used for various generic low-level features.
  - Cth, Converse threads, user-level nonpreemptive threads. (threads.C)
  - Ccd, Converse condition detection, similar to signal handling. (conv-conds.C)
  - Ccs, Converse client/server, socket access to parallel job. (conv-ccs.C; ccs-*)
  - Cpd, Converse parallel debugger. (debug-*)
  - Crn, Converse random number generation. (random.C)
  - Csd, Converse scheduler daemon.  (convcore.C)
  - Cqs, Converse prioritized queueing system. (queueing.c)
  - CQd, Converse quiescence detection. (quiescense.c)
     Not the same as Charm++ QD. (qd.C)
  
  Globally accessible variables in Converse can be shared as:
  - Ctv, Converse thread-private variable.
     one copy per Converse user-level thread.
  - Cpv, Converse processor-private variable.  Very common;
     one copy per processor of each node.
  - Csv, Converse node-shared variable.  Global variables 
     shared by all processors of a node (beware of thread safety!)

  @ingroup CharmScheduler
  @ingroup ConverseScheduler

*/
#ifndef CONVERSE_H
#define CONVERSE_H

/*
 * We cannot use thread_local here because "extern thread_local" variables
 * have additional initialization semantics that the RTS does not consider.
 * https://stackoverflow.com/a/13123870
 *
 * KEEPINSYNC: GKlib.h
 */
#if defined _MSC_VER
# define CMK_THREADLOCAL __declspec(thread)
#else
# define CMK_THREADLOCAL __thread
#endif

#if defined _MSC_VER
# define CMI_FORCE_INLINE __forceinline
#elif defined __GNUC__
# define CMI_FORCE_INLINE inline __attribute__((always_inline))
#else
# define CMI_FORCE_INLINE inline
#endif

#if defined(__GNUC__) && !defined(__clang__)
#define CMI_NOOPTIMIZE __attribute__((optimize(0)))
#else
#define CMI_NOOPTIMIZE
#endif

#include "conv-header.h"

/* Root of broadcast:
 * non-bcast msg: root = 0;
 * proc-level bcast msg: root >=1; (CmiMyPe()+1)
 * node-level bcast msg: root <=-1; (-CmiMyNode()-1)
 */
#define CMI_BROADCAST_ROOT(msg)          ((CmiMsgHeaderBasic *)msg)->root
#define CMI_SET_BROADCAST_ROOT(msg, root)  CMI_BROADCAST_ROOT(msg) = (root);

#define CMI_ZC_MSGTYPE(msg)                  ((CmiMsgHeaderBasic *)msg)->zcMsgType
#define CMI_IS_ZC_P2P(msg)                   (CMI_ZC_MSGTYPE(msg) == CMK_ZC_P2P_SEND_MSG || CMI_ZC_MSGTYPE(msg) == CMK_ZC_P2P_RECV_MSG)
#define CMI_IS_ZC_BCAST(msg)                 (CMI_ZC_MSGTYPE(msg) == CMK_ZC_BCAST_SEND_MSG || CMI_ZC_MSGTYPE(msg) == CMK_ZC_BCAST_RECV_MSG)
#define CMI_IS_ZC_RECV(msg)                  (CMI_ZC_MSGTYPE(msg) == CMK_ZC_P2P_RECV_MSG || CMI_ZC_MSGTYPE(msg) == CMK_ZC_BCAST_RECV_MSG)
#define CMI_IS_ZC(msg)                       (CMI_IS_ZC_P2P(msg) || CMI_IS_ZC_BCAST(msg))
#define CMI_IS_ZC_DEVICE(msg)                (CMI_ZC_MSGTYPE(msg) == CMK_ZC_DEVICE_MSG)

#define CMI_MSG_NOKEEP(msg)                  ((CmiMsgHeaderBasic *)msg)->nokeep

/* Round x up to the next multiple of n (n must be a power of two).
   Every use of n is parenthesized so compound arguments
   (e.g. CMIALIGN(len, 1<<4)) expand correctly; previously the bare
   `n` mis-bound under cast and shift precedence. */
#define CMIALIGN(x,n)       (size_t)((~((size_t)(n)-1))&((x)+((n)-1)))
/*#define ALIGN8(x)        (size_t)((~7)&((x)+7)) */
#define ALIGN8(x)          CMIALIGN(x,8)
#define ALIGN16(x)         CMIALIGN(x,16)

/* Default alignment granularity: 16 bytes on 64-bit builds, 8 otherwise. */
#if !defined(ALIGN_BYTES)
#if CMK_64BIT
#define ALIGN_BYTES           16U
#else
#define ALIGN_BYTES           8U
#endif
#endif

#define ALIGN_DEFAULT(x) CMIALIGN(x, ALIGN_BYTES)

/* Number of padding bytes needed to bring x up to an n-byte boundary. */
#define CMIPADDING(x, n) (CMIALIGN((x), (n)) - (size_t)(x))

/**
  Grab configuration files generated by build and configure scripts. 
*/
#ifndef __STDC_FORMAT_MACROS
# define __STDC_FORMAT_MACROS
#endif
#ifndef __STDC_LIMIT_MACROS
# define __STDC_LIMIT_MACROS
#endif
#include <stdint.h>
#include <inttypes.h>

#include "cmiqueue.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined __cplusplus
#include <atomic>
#elif defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L && !__STDC_NO_ATOMICS__ && \
      (!defined __GNUC__ || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 0) || defined __clang__ || \
      (!defined _OPENMP && !defined __OBJC__))
// see GCC SVN r239970 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65467
#include <stdatomic.h>
#define CMK_HAS_C11_STDATOMIC 1
#endif

/* brittle accommodation of libc header internals */
#if defined __cplusplus && defined __THROW
# define CMK_THROW __THROW
#else
# define CMK_THROW
#endif

#ifndef __has_builtin
# define __has_builtin(x) 0  // Compatibility with non-clang compilers.
#endif
#if (defined __GNUC__ || __has_builtin(__builtin_unreachable)) && !defined _CRAYC
// Technically GCC 4.5 is the minimum for this feature, but we require C++11.
# define CMI_UNREACHABLE_SECTION(...) __builtin_unreachable()
#elif _MSC_VER
# define CMI_UNREACHABLE_SECTION(...) __assume(0)
#else
# define CMI_UNREACHABLE_SECTION(...) __VA_ARGS__
#endif

#define CMI_NORETURN_FUNCTION_END CMI_UNREACHABLE_SECTION(while(1));

# if defined __cplusplus
#  define CMK_NORETURN [[noreturn]]
# else
#  if defined _Noreturn
#   define CMK_NORETURN _Noreturn
#  elif defined _MSC_VER && 1200 <= _MSC_VER
#   define CMK_NORETURN __declspec (noreturn)
#  else
#   define CMK_NORETURN __attribute__ ((__noreturn__))
#  endif
# endif

// must be placed before return type and at both declaration and definition
#if defined __GNUC__ && __GNUC__ >= 4
# define CMI_WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
#elif defined _MSC_VER && _MSC_VER >= 1700
# define CMI_WARN_UNUSED_RESULT _Check_return_
#else
# define CMI_WARN_UNUSED_RESULT
#endif

#if defined __cplusplus && __cplusplus >= 201402L
#  define CMK_DEPRECATED_MSG(x) [[deprecated(x)]]
#  define CMK_DEPRECATED [[deprecated]]
#elif defined __GNUC__ || defined __clang__
#  define CMK_DEPRECATED_MSG(x) __attribute__((deprecated(x)))
#  define CMK_DEPRECATED __attribute__((deprecated))
#elif defined _MSC_VER
#  define CMK_DEPRECATED_MSG(x) __declspec(deprecated(x))
#  define CMK_DEPRECATED __declspec(deprecated)
#else
#  define CMK_DEPRECATED_MSG(x)
#  define CMK_DEPRECATED
#endif

/* Paste the tokens x and y together, without any space between them.
   The ANSI C way to do this is the bizarre ## "token-pasting" 
   preprocessor operator.
 */
#define CMK_CONCAT(x,y) x##y
/* Tag variable y as being from unit x: */
#define CMK_TAG(x,y) x##y##_

#include "pup_c.h"

/* the following flags denote properties of the C compiler,  */
/* not the C++ compiler.  If this is C++, ignore them.       */
#ifdef __cplusplus

#if ! CMK_HAS_OFFSETOF
#undef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#else
#include <stddef.h>
#endif

extern "C" {
#endif

#ifdef _WIN32
# define CmiAlignedAlloc(alignment, size) _aligned_malloc((size), (alignment))
# define CmiAlignedFree(ptr) _aligned_free(ptr)
#else
void * CmiAlignedAlloc(size_t alignment, size_t size);
# define CmiAlignedFree(ptr) free(ptr)
#endif

/* Global variables used by charmdebug to maintain information */
extern void CpdSetInitializeMemory(int v);
extern void CpdSystemEnter(void);
extern void CpdSystemExit(void);
#if CMK_CHARMDEBUG
extern int memory_status_info;
extern int memory_chare_id;
#define setMemoryStatus(p) { \
  int tmp = memory_status_info; \
  memory_status_info = p; \
  p = tmp; \
}
int setMemoryChareIDFromPtr(void *p);
void setMemoryChareID(int id);
void setMemoryOwnedBy(void *p, int id);
#else
#define setMemoryStatus(p) /* empty */
#define setMemoryChareIDFromPtr(p) /* empty */
#define setMemoryChareID(p) /* empty */
#define setMemoryOwnedBy(p, id) /* empty */
#endif

/******************************************************************************
 *
 * Deal with Shared Memory
 *
 * Shared memory strongly affects how CPV, CSV, and CmiMyPe are defined,
 * and how memory locking is performed. Therefore, we control all these
 * functions with a single flag.
 *
 *****************************************************************************/

#ifdef __cplusplus
/* In C++, use new so t's constructor gets called */
# define CpvInit_Alloc(t,n) new t[n]()
# define CpvInit_Alloc_scalar(t) new t()
#else
# define CpvInit_Alloc(t,n) (t *)calloc(n,sizeof(t))
# define CpvInit_Alloc_scalar(t) (t *)calloc(1,sizeof(t))
#endif

extern int CmiMyRank_(void);

#if CMK_HAS_PARTITION

/* Strategy used to divide the allocated nodes into logical partitions
   (set via CmiSetMasterPartition / CmiSetPartitionSizes / etc. below). */
typedef enum Partition_Type {
      PARTITION_SINGLETON,   /* whole job is a single partition */
      PARTITION_DEFAULT,     /* presumably uniform partition sizes -- confirm in CmiCreatePartitions */
      PARTITION_MASTER,      /* selected by CmiSetMasterPartition() */
      PARTITION_PREFIX       /* sizes given explicitly; see partsizes below -- TODO confirm */
} Partition_Type;

/* variables and functions for partition */
typedef struct {
  Partition_Type type;       /* which partitioning scheme above */
  int isTopoaware, scheme;   /* topology-aware flag and scheme id (CmiSetPartitionScheme) */
  int numPartitions;         /* total number of partitions (CmiNumPartitions) */
  int *partitionSize;        /* per-partition size, indexed by partition (CmiPartitionSize) */
  int *partitionPrefix;      /* presumably prefix sums of partitionSize -- verify in machine layer */
  int *nodeMap;              /* node renumbering map -- NOTE(review): semantics not visible here */
  int myPartition;           /* partition this process belongs to (CmiMyPartition) */
  char *partsizes;           /* textual size spec passed to CmiSetPartitionSizes */
} PartitionInfo;

void CmiCreatePartitions(char **argv);
#if defined(__cplusplus)
extern "C" {
#endif
void CmiSetNumPartitions(int nump);
void CmiSetMasterPartition(void);
void CmiSetPartitionSizes(char *size);
void CmiSetPartitionScheme(int scheme);
void CmiSetCustomPartitioning(void);
#if defined(__cplusplus)
}
#endif 

extern int _Cmi_mype_global;
extern int _Cmi_numpes_global;
extern int _Cmi_mynode_global;
extern int _Cmi_numnodes_global;
extern PartitionInfo _partitionInfo;

#define CmiNumPartitions()              _partitionInfo.numPartitions
#define CmiMyPartition()                _partitionInfo.myPartition
#define CmiPartitionSize(part)          _partitionInfo.partitionSize[part]
#define CmiMyPartitionSize()            CmiPartitionSize(CmiMyPartition())
#define CmiNumNodesGlobal()             _Cmi_numnodes_global
#define CmiMyNodeGlobal()               _Cmi_mynode_global
#define CmiNumPesGlobal()               _Cmi_numpes_global
/* we need different implementations of this based on SMP or non-smp */
#if !CMK_SMP
#define CmiMyPeGlobal()                 _Cmi_mype_global
extern int _Cmi_mynodesize;
#else
extern int CmiMyPeGlobal(void);
#endif

/* we need nodeSpan to find how many pes each node cover */
int CmiNodeSpan(void);

/* functions to translate between local and global */
int node_lToGTranslate(int node, int partition);
int node_gToLTranslate(int node);
int pe_lToGTranslate(int pe, int partition);
int pe_gToLTranslate(int pe);

#define CmiGetPeGlobal(pe,part)         pe_lToGTranslate(pe,part)
#define CmiGetNodeGlobal(node,part)     node_lToGTranslate(node,part)
#define CmiGetPeLocal(pe)               pe_gToLTranslate(pe)
#define CmiGetNodeLocal(node)           node_gToLTranslate(node)
/* end of variables and functions for partition */

#else

#define CmiMyPartition()         0
#define CmiPartitionSize(part)       CmiNumNodes()
#define CmiMyPartitionSize()         CmiNumNodes()
#define CmiNumPartitions()       1
#define CmiNumNodesGlobal()      CmiNumNodes()
#define CmiMyNodeGlobal()        CmiMyNode()
#define CmiNumPesGlobal()        CmiNumPes()
#define CmiMyPeGlobal()          CmiMyPe()
#if !CMK_SMP
extern int _Cmi_mynodesize;
#endif
#define CmiGetPeGlobal(pe,part)         (pe)
#define CmiGetNodeGlobal(node,part)     (node)
#define CmiGetPeLocal(pe)               (pe)
#define CmiGetNodeLocal(node)           (node)
#endif

/* This node's neighborhood in a spanning tree over the nodes:
   its parent, and an array of child_count children. */
typedef struct {
  int parent;       /* parent node id in the tree -- root's value not visible here; confirm */
  int child_count;  /* number of valid entries in children[] */
  int *children;    /* ids of this node's children */
} CmiSpanningTreeInfo;

extern CmiSpanningTreeInfo* _topoTree; // this node's parent and children in topo-tree rooted at 0

#if CMK_SHARED_VARS_UNAVAILABLE /* Non-SMP version of shared vars. */
extern int _Cmi_mype;
extern int _Cmi_numpes;
extern int _Cmi_myrank; /* Normally zero; only 1 during SIGIO handling */
extern int _Cmi_mynode;
extern int _Cmi_numnodes;

#define CmiMyPe()           _Cmi_mype
#define CmiMyRank()         0
#define CmiNumPes()         _Cmi_numpes
#define CmiMyNodeSize()     1
#define CmiMyNode()         _Cmi_mype
#define CmiNumNodes()       _Cmi_numpes
#define CmiNodeFirst(node)  (node)
#define CmiNodeSize(node)   1
#define CmiNodeOf(pe)       (pe)
#define CmiRankOf(pe)       0

#define CpvDeclare(t,v) t CMK_TAG(Cpv_,v)[2]
#define CpvExtern(t,v)  extern t CMK_TAG(Cpv_,v)[2]
#ifdef __cplusplus
#define CpvCExtern(t,v)    extern "C" t CMK_TAG(Cpv_,v)[2]
#else
#define CpvCExtern(t,v)    CpvExtern(t,v)
#endif
#define CpvStaticDeclare(t,v) static t CMK_TAG(Cpv_,v)[2]
#define CpvInitialize(t,v) do {} while(0)
#define CpvInitialized(v) 1
#define CpvAccess(v) CMK_TAG(Cpv_,v)[_Cmi_myrank]
#define CpvAccessOther(v, r) CMK_TAG(Cpv_,v)[r]

extern void CmiMemLock(void);
extern void CmiMemUnlock(void);
#define CmiNodeBarrier() /*empty*/
#define CmiNodeAllBarrier() /*empty*/
#define CmiSvAlloc CmiAlloc

#if CMK_USE_LRTS /*LRTS provides locking*/
#include "lrtslock.h"
#else
typedef int CmiNodeLock;
#define CmiCreateLock() (0)
#define CmiLock(lock) {(lock)++;}
#define CmiUnlock(lock)  {(lock)--;}
#define CmiTryLock(lock)  ((lock)?1:((lock)=1,0))
#define CmiDestroyLock(lock) /*empty*/
#endif // CMK_USE_LRTS

#define CmiInCommThread() (0)

#endif

#if CMK_SHARED_VARS_POSIX_THREADS_SMP /*Used by the netlrts-*-smp versions*/

#include <pthread.h>
#include <sched.h>
#ifdef CMK_FAKE_SCHED_YIELD
#include <unistd.h>
#define sched_yield() sleep(0)
#endif

extern int _Cmi_numpes;
extern int _Cmi_mynodesize;
extern int _Cmi_mynode;
extern int _Cmi_numnodes;
extern int _Cmi_sleepOnIdle;
extern int _Cmi_forceSpinOnIdle;

int CmiMyPe(void);
int CmiMyRank(void);
#define CmiNumPes()         _Cmi_numpes
#define CmiMyNodeSize()     _Cmi_mynodesize
int CmiNodeSize(int node);
#if CMK_MULTICORE
#define CmiMyNode()         0
#define CmiNumNodes()       1
#define CmiNodeFirst(node)  0
#define CmiNodeOf(pe)       0
#define CmiRankOf(pe)       pe
#else
#define CmiMyNode()         _Cmi_mynode
#define CmiNumNodes()       _Cmi_numnodes
int CmiNodeFirst(int node);
int CmiNodeOf(int pe);
int CmiRankOf(int pe);
#endif

#define CMK_CPV_IS_SMP sched_yield();

extern void CmiNodeBarrier(void);
extern void CmiNodeAllBarrier(void);
#define CmiSvAlloc CmiAlloc

#if CMK_USE_LRTS
#include "lrtslock.h"
#else
#if CMK_HAS_SPINLOCK && CMK_USE_SPINLOCK
typedef pthread_spinlock_t *CmiNodeLock;
#define CmiLock(lock) (pthread_spin_lock(lock))
#define CmiUnlock(lock) (pthread_spin_unlock(lock))
#define CmiTryLock(lock) (pthread_spin_trylock(lock))
#else
typedef pthread_mutex_t *CmiNodeLock;
#define CmiLock(lock) (pthread_mutex_lock(lock))
#define CmiUnlock(lock) (pthread_mutex_unlock(lock))
#define CmiTryLock(lock) (pthread_mutex_trylock(lock))
#endif
extern CmiNodeLock CmiCreateLock(void);
extern void CmiDestroyLock(CmiNodeLock lock);
#endif // CMK_USE_LRTS

extern CmiNodeLock CmiMemLock_lock;
#define CmiMemLock() do{if (CmiMemLock_lock) CmiLock(CmiMemLock_lock);} while (0)

#define CmiMemUnlock() do{if (CmiMemLock_lock) CmiUnlock(CmiMemLock_lock);} while (0)


#if (CMK_BLUEGENEQ || CMK_PAMI_LINUX_PPC8) && CMK_ENABLE_ASYNC_PROGRESS
extern CMK_THREADLOCAL int32_t _cmi_bgq_incommthread;
#define CmiInCommThread()  (_cmi_bgq_incommthread)
#else
#define CmiInCommThread()  (CmiMyRank() == CmiMyNodeSize())
#endif

#endif /* POSIX_THREADS_SMP */

#include "string.h"

#if CMK_BLUEGENEQ && CMK_BLUEGENEQ_OPTCOPY
void CmiMemcpy_qpx (void *dst, const void *src, size_t n);
#define CmiMemcpy(_dst, _src, _n)                                        \
  do {                                                                   \
    const void *_cmimemcpy_src = (_src);                                 \
    void *_cmimemcpy_dst = (_dst);                                       \
    size_t _cmimemcpy_n = (_n);                                          \
    if ( (_cmimemcpy_n > 512+32) &&                                      \
         ((((size_t)_cmimemcpy_dst|(size_t)_cmimemcpy_src) & 0x1F)==0) ) \
      CmiMemcpy_qpx(_cmimemcpy_dst, _cmimemcpy_src, _cmimemcpy_n);       \
    else                                                                 \
      memcpy(_cmimemcpy_dst, _cmimemcpy_src, _cmimemcpy_n);              \
  } while(0)
#else
#define CmiMemcpy(dest, src, size) memcpy((dest), (src), (size))
#endif


#if CMK_SHARED_VARS_NT_THREADS /*Used only by win versions*/

#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include "lrtslock.h"

extern int _Cmi_numpes;
extern int _Cmi_mynodesize;
extern int _Cmi_mynode;
extern int _Cmi_numnodes;
extern int _Cmi_sleepOnIdle;
extern int _Cmi_forceSpinOnIdle;

int CmiMyPe(void);
int CmiMyRank(void);
#define CmiNumPes()         _Cmi_numpes
#define CmiMyNodeSize()     _Cmi_mynodesize
int CmiNodeSize(int node);
#if CMK_MULTICORE
#define CmiMyNode()         0
#define CmiNumNodes()       1
#define CmiNodeFirst(node)  0
#define CmiNodeOf(pe)       0
#define CmiRankOf(pe)       pe
#else
#define CmiMyNode()         _Cmi_mynode
#define CmiNumNodes()       _Cmi_numnodes
int CmiNodeFirst(int node);
int CmiNodeOf(int pe);
int CmiRankOf(int pe);
#endif

#define CMK_CPV_IS_SMP Sleep(0);

extern void CmiNodeBarrier(void);
extern void CmiNodeAllBarrier(void);
#define CmiSvAlloc CmiAlloc

extern CmiNodeLock CmiMemLock_lock;
#define CmiMemLock() do{if (CmiMemLock_lock) CmiLock(CmiMemLock_lock);} while (0)
#define CmiMemUnlock() do{if (CmiMemLock_lock) CmiUnlock(CmiMemLock_lock);} while (0)

#if CMK_SMP
#define CmiInCommThread()  (CmiMyRank() == CmiMyNodeSize())
#else
#define CmiInCommThread()  (0)
#endif

#endif /* CMK_SHARED_VARS_NT_THREADS */

#if CMK_SHARED_VARS_UNAVAILABLE /* non-SMP version */

/* Non-SMP immediate-message "lock": a nesting counter plus a deferred
   flag.  Taking the lock just bumps the counter; if an immediate message
   arrives while locked, _immediateFlag is raised (by CmiCheckImmediateLock)
   and the message is probed when the last unlock drops the counter. */
typedef int CmiImmediateLockType;
extern int _immediateLock;
extern int _immediateFlag;
#define CmiCreateImmediateLock() (0)
#define CmiImmediateLock(ignored) { _immediateLock++; }
#if CMK_IMMEDIATE_MSG
#define CmiImmediateUnlock(ignored) \
  { _immediateLock--; \
    if(_immediateFlag) \
      CmiProbeImmediateMsg(); } 
#else
#define CmiImmediateUnlock(ignored) { _immediateLock--; }
#endif
/* Returns 1 (and records the deferred delivery) iff currently locked. */
#define CmiCheckImmediateLock(ignored) \
  ((_immediateLock)?((_immediateFlag=1),1):0)
#define CmiClearImmediateFlag() { _immediateFlag=0; }

#else /* SMP and all other weird versions */

/* SMP: immediate-message protection is a real node lock. */
typedef CmiNodeLock CmiImmediateLockType;
#define CmiCreateImmediateLock() CmiCreateLock()
#define CmiImmediateLock(immediateLock) CmiLock((immediateLock))
#define CmiImmediateUnlock(immediateLock) CmiUnlock((immediateLock)) 
#define CmiCheckImmediateLock(ignored)  (0)
#define CmiClearImmediateFlag() 

#endif

/* This is the default Cpv implementation for SMP-style systems:
A Cpv variable is actually a pointer to an array of values, one
for each processor in the node.
*/
#ifdef CMK_CPV_IS_SMP

#if CMK_HAS_TLS_VARIABLES && !CMK_NOT_USE_TLS_THREAD
/* TLS flavor: each thread holds its copy in a thread-local pointer
   (fast CpvAccess), while Cpv_addr_ keeps every rank's pointer so
   CpvAccessOther can reach across ranks.  Cpv_inited_ guards the
   once-per-node allocation of the address table. */
#define CpvDeclare(t,v) CMK_THREADLOCAL t* CMK_TAG(Cpv_,v) = NULL;   \
                        int CMK_TAG(Cpv_inited_,v) = 0;  \
                        t ** CMK_TAG(Cpv_addr_,v)
#define CpvExtern(t,v)  extern CMK_THREADLOCAL t* CMK_TAG(Cpv_,v);  \
                        extern int CMK_TAG(Cpv_inited_,v);  \
                        extern t ** CMK_TAG(Cpv_addr_,v)
#ifdef __cplusplus
#define CpvCExtern(t,v) extern "C" CMK_THREADLOCAL t* CMK_TAG(Cpv_,v);  \
                        extern "C" int CMK_TAG(Cpv_inited_,v);  \
                        extern "C" t ** CMK_TAG(Cpv_addr_,v)
#else
#define CpvCExtern(t,v)    CpvExtern(t,v)
#endif
#define CpvStaticDeclare(t,v) static CMK_THREADLOCAL t* CMK_TAG(Cpv_,v) = NULL;   \
                        static int CMK_TAG(Cpv_inited_,v) = 0;  \
                        static t ** CMK_TAG(Cpv_addr_,v)
/* First caller allocates the shared address table (under CmiMemLock);
   every thread then allocates its private copy and registers it. */
#define CpvInitialize(t,v)\
    do {                                                               \
      CmiMemLock();                                                    \
      if (!(CMK_TAG(Cpv_inited_,v))) {                                 \
        CMK_TAG(Cpv_addr_,v) = CpvInit_Alloc(t*, 1+CmiMyNodeSize());   \
        CMK_TAG(Cpv_inited_,v) = 1;                                    \
      }                                                                \
      CmiMemUnlock();                                                  \
      CMK_TAG(Cpv_,v) = CpvInit_Alloc_scalar(t);                       \
      CMK_TAG(Cpv_addr_,v)[CmiMyRank()] = CMK_TAG(Cpv_,v);             \
    } while(0)
#define CpvInitialized(v) (0!=CMK_TAG(Cpv_,v))

#if (CMK_BLUEGENEQ || CMK_PAMI_LINUX_PPC8) && CMK_ENABLE_ASYNC_PROGRESS && CMK_IMMEDIATE_MSG
  #define CpvAccess(v) (*(CMK_TAG(Cpv_addr_,v)[CmiMyRank()]))
#else
#define CpvAccess(v) (*CMK_TAG(Cpv_,v))
#endif

#define CpvAccessOther(v, r) (*(CMK_TAG(Cpv_addr_,v)[r]))
#else

/* Array flavor (no TLS): the Cpv variable is one shared array with a
   slot per rank, indexed by CmiMyRank() on every access. */
#define CpvDeclare(t,v) t* CMK_TAG(Cpv_,v)
#define CpvExtern(t,v)  extern t* CMK_TAG(Cpv_,v)
#ifdef __cplusplus
#define CpvCExtern(t,v)    extern "C" t* CMK_TAG(Cpv_,v)
#else
#define CpvCExtern(t,v)    CpvExtern(t,v)
#endif
#define CpvStaticDeclare(t,v) static t* CMK_TAG(Cpv_,v)
/* Rank 0 allocates; other ranks spin (yielding via CMK_CPV_IS_SMP)
   until the publishing write becomes visible. */
#define CpvInitialize(t,v)\
    do { \
       if (CmiMyRank()) { \
               CmiMemoryReadFence(); \
		       while (!CpvInitialized(v)) { CMK_CPV_IS_SMP ; CmiMemoryReadFence(); } \
       } else { \
               t* tmp = CpvInit_Alloc(t,1+CmiMyNodeSize());\
               CmiMemoryWriteFence();   \
               CMK_TAG(Cpv_,v)=tmp;   \
	       /* CMK_TAG(Cpv_,v)=CpvInit_Alloc(t,1+CmiMyNodeSize()); */\
       } \
    } while(0)
#define CpvInitialized(v) (0!=CMK_TAG(Cpv_,v))
#define CpvAccess(v) CMK_TAG(Cpv_,v)[CmiMyRank()]
#define CpvAccessOther(v, r) CMK_TAG(Cpv_,v)[r]
#endif

#endif

/*Csv are the same almost everywhere:*/
#ifndef CsvDeclare
#define CsvDeclare(t,v) t CMK_TAG(Csv_,v)
#define CsvStaticDeclare(t,v) static t CMK_TAG(Csv_,v)
#define CsvExtern(t,v) extern t CMK_TAG(Csv_,v)
#define CsvInitialize(t,v) do{}while(0)
#define CsvInitialized(v) 1
#define CsvAccess(v) CMK_TAG(Csv_,v)
#endif

extern CmiNodeLock _smp_mutex;

extern int CmiBarrier(void);
extern int CmiBarrierZero(void);

/* cpu topology */
extern int CmiNumCores(void);
extern int CmiCpuTopologyEnabled(void);
extern int CmiPeOnSamePhysicalNode(int pe1, int pe2);
extern int CmiNumPhysicalNodes(void);
extern int CmiPhysicalNodeID(int pe);
extern int CmiNumPesOnPhysicalNode(int node);
extern void CmiGetPesOnPhysicalNode(int node, int **pelist, int *num);
extern int CmiGetFirstPeOnPhysicalNode(int node);
extern int CmiPhysicalRank(int pe);
extern void CmiInitCPUAffinity(char **argv);
extern int CmiPrintCPUAffinity(void);
extern int CmiSetCPUAffinity(int core);
extern int CmiSetCPUAffinityLogical(int core);
extern void CmiInitCPUTopology(char **argv);
extern int CmiOnCore(void);

/* Hardware topology counts filled in by CmiInitHwlocTopology(). */
typedef struct
{
  int num_pus;        /* processing units (hardware threads) -- scope (process vs node) set in CmiInitHwlocTopology; confirm there */
  int num_cores;      /* physical cores */
  int num_sockets;    /* CPU sockets/packages */

  int total_num_pus;  /* presumably PUs on the whole machine vs. num_pus' subset -- verify */
} CmiHwlocTopology;

extern CmiHwlocTopology CmiHwlocTopologyLocal;

extern void CmiInitHwlocTopology(void);

/** Return 1 if our outgoing message queue 
   for this node is longer than this many bytes. */
int CmiLongSendQueue(int forNode,int longerThanBytes);

/******** CMI, CSD: MANY LOW-LEVEL OPERATIONS ********/

/* One entry of the Converse handler table (CmiHandlerTable):
   the handler function plus the opaque pointer given at registration
   time (CmiRegisterHandlerEx), passed back on every invocation. */
typedef struct {
	CmiHandlerEx hdlr;  /* function invoked for messages bearing this handler index */
	void *userPtr;      /* opaque per-handler context */
} CmiHandlerInfo;

#include "queueing.h" /* for "Queue" */

CpvExtern(CmiHandlerInfo*, CmiHandlerTable);
CpvExtern(int,         CmiHandlerMax);
CpvExtern(Queue,       CsdSchedQueue);
#if CMK_SMP && CMK_TASKQUEUE
CpvExtern(Queue,       CsdTaskQueue);
CpvExtern(void*,       CmiSuspendedTaskQueue);
#endif
#if CMK_GRID_QUEUE_AVAILABLE
CpvExtern(Queue,      CsdGridQueue);
#endif
#if CMK_OBJECT_QUEUE_AVAILABLE
CpvExtern(Queue,       CsdObjQueue);
#endif
#if CMK_NODE_QUEUE_AVAILABLE
CsvExtern(Queue,       CsdNodeQueue);
CsvExtern(CmiNodeLock, CsdNodeQueueLock);
#endif
CpvExtern(int,         CsdStopFlag);
CpvExtern(int,         CsdLocalCount);
#define CSD_LOCAL_MAX_DEFAULT 0

extern void CmiAssignOnce(int* variable, int value);

extern int CmiRegisterHandler(CmiHandler h);
extern int CmiRegisterHandlerEx(CmiHandlerEx h,void *userPtr);
#if CMI_LOCAL_GLOBAL_AVAILABLE
extern int CmiRegisterHandlerLocal(CmiHandler);
extern int CmiRegisterHandlerGlobal(CmiHandler);
#endif
extern void CmiNumberHandler(int n, CmiHandler h);
extern void CmiNumberHandlerEx(int n, CmiHandlerEx h,void *userPtr);

#define CmiGetHandler(m)  (((CmiMsgHeaderExt*)m)->hdl)
#define CmiGetXHandler(m) (((CmiMsgHeaderExt*)m)->xhdl)
#define CmiGetInfo(m)     (((CmiMsgHeaderExt*)m)->info)
#define CmiGetRoot(m)     (((CmiMsgHeaderExt*)m)->root)
#define CmiGetRedID(m)    (((CmiMsgHeaderExt*)m)->redID)

#define CmiSetHandler(m,v)  do {((((CmiMsgHeaderExt*)m)->hdl)=(v));} while(0)
#define CmiSetXHandler(m,v) do {((((CmiMsgHeaderExt*)m)->xhdl)=(v));} while(0)
#define CmiSetInfo(m,v)     do {((((CmiMsgHeaderExt*)m)->info)=(v));} while(0)
#define CmiSetRoot(m,v)     do {((((CmiMsgHeaderExt*)m)->root)=(v));} while(0)
#define CmiSetRedID(m,v)    do {((((CmiMsgHeaderExt*)m)->redID)=(v));} while(0)

#define CmiHandlerToInfo(n) (CpvAccess(CmiHandlerTable)[n])
#define CmiHandlerToFunction(n) (CmiHandlerToInfo(n).hdlr)
#define CmiGetHandlerInfo(env) (CmiHandlerToInfo(CmiGetHandler(env)))
#define CmiGetHandlerFunction(env) (CmiHandlerToFunction(CmiGetHandler(env)))

#if __FAULT__
CpvExtern(int, _curRestartPhase);      /* number of restarts */
#endif

#if CMK_MEM_CHECKPOINT
#undef CmiSetHandler
#define CmiSetHandler(m,v)  do {(((CmiMsgHeaderExt*)m)->hdl)=(v); (((CmiMsgHeaderExt*)m)->pn)=CpvAccess(_curRestartPhase);} while(0)
#define MESSAGE_PHASE_CHECK(msg)	\
	{	\
          int phase = CmiGetRestartPhase(msg);	\
	  if (phase != 9999 && phase < CpvAccess(_curRestartPhase)) {	\
            /* CmiPrintf("[%d] discard message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */	\
            CmiFree(msg);	\
	    return;	\
          }	\
          /* CmiAssert(phase == cur_restart_phase || phase == 9999); */ \
          if (phase > CpvAccess(_curRestartPhase) && phase != 9999) {    \
            /* CmiPrintf("[%d] enqueue message of phase %d cur_restart_phase:%d. \n", CmiMyPe(), phase, cur_restart_phase); */	\
            CsdEnqueueFifo(msg);    \
	    return;	\
          }     \
	}
#else
#define MESSAGE_PHASE_CHECK(msg)
#endif

#if defined __cplusplus

/** This header goes before each chunk of memory allocated with CmiAlloc.
    See the comment in convcore.C for details on the fields.

    The reference count is private and manipulated only through the
    accessors below; under CMK_SMP it is a std::atomic<int> so threads
    of one node can share a chunk.
*/
struct CmiChunkHeader {
  int size;
private:
#if CMK_SMP
  std::atomic<int> ref;
#else
  int ref;
#endif
#if ALIGN_BYTES > 8
  #if defined(__GNUC__) || defined(__clang__)
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wpedantic"
  #if defined(__clang__)
  #pragma GCC diagnostic ignored "-Wunused-private-field"
  #endif
  #endif
  /* Padding so the user data following the header stays ALIGN_BYTES-
     aligned; the IB layers prepend one extra pointer (see
     infiCmiChunkHeaderStruct), hence the conditional term. */
  char align[ALIGN_BYTES
             - sizeof(int)*2
#if (CMK_USE_IBVERBS || CMK_USE_IBUD)
             - sizeof(void *)
#endif
            ];
  #if defined(__GNUC__) || defined(__clang__)
  #pragma GCC diagnostic pop
  #endif
#endif
public:
  CmiChunkHeader() = default;
  CmiChunkHeader(const CmiChunkHeader & x)
    : size{x.size}, ref{x.getRef()} { }
#if CMK_SMP
  int getRef() const
  {
    return ref.load(std::memory_order_acquire);
  }
  void setRef(int r)
  {
    return ref.store(r, std::memory_order_release);
  }
  /* incRef/decRef return the PREVIOUS count (fetch_add/fetch_sub
     semantics), matching ref++/ref-- in the non-SMP branch.
     NOTE(review): decRef uses memory_order_release only; the usual
     refcount idiom pairs the final decrement with an acquire before
     freeing -- confirm the free path in convcore.C supplies that. */
  int incRef()
  {
    return ref.fetch_add(1, std::memory_order_release);
  }
  int decRef()
  {
    return ref.fetch_sub(1, std::memory_order_release);
  }
#else
  int getRef() const { return ref; }
  void setRef(int r) { ref = r; }
  int incRef() { return ref++; }
  int decRef() { return ref--; }
#endif
};

#if CMK_USE_IBVERBS | CMK_USE_IBUD
struct infiCmiChunkMetaDataStruct;

#define CMI_INFI_CHUNK_HEADER_FIELDS \
struct infiCmiChunkMetaDataStruct *metaData;\
CmiChunkHeader chunkHeader;

struct infiCmiChunkHeaderHelper{
  CMI_INFI_CHUNK_HEADER_FIELDS
};

typedef struct infiCmiChunkHeaderStruct{
  CMI_INFI_CHUNK_HEADER_FIELDS
} infiCmiChunkHeader;

struct infiCmiChunkMetaDataStruct *registerMultiSendMesg(char *msg,int msgSize);
#endif

/* Given a user chunk m, extract the enclosing chunk header fields: */
#define BLKSTART(m) ((CmiChunkHeader *) (((intptr_t)m) - sizeof(CmiChunkHeader)))
#define SIZEFIELD(m) ((BLKSTART(m))->size)
#define REFFIELD(m) ((BLKSTART(m))->getRef())
#define REFFIELDSET(m, r) ((BLKSTART(m))->setRef(r))
#define REFFIELDINC(m) ((BLKSTART(m))->incRef())
#define REFFIELDDEC(m) ((BLKSTART(m))->decRef())

#endif

#ifdef __cplusplus
extern "C" {
#endif
void* malloc_nomigrate(size_t size);
void free_nomigrate(void* ptr);
#ifdef __cplusplus
}
#endif

/**
   Allocate `size` bytes of memory usable as a message buffer.

   Such memory may be in limited supply and expensive to obtain on
   machine layers that use registered or pinned memory when
   interacting with the communication hardware. Uses besides buffers
   in which to construct messages should prefer the malloc()/free()
   provided by libmemory-*.
*/
void    *CmiAlloc(int size);
void     CmiReference(void *blk);
int      CmiGetReference(void *blk);
int      CmiSize(void *blk);
void     CmiFree(void *blk);
void     CmiRdmaFree(void *blk);
void     CmiInitMsgHeader(void *msg, int size);

#ifndef CMI_TMP_SKIP
void *CmiTmpAlloc(int size);
void CmiTmpFree(void *);
#endif

/* Pool features */


/* Various special features of certain -memory modes: */
extern void * memory_stack_top; /* contains the top of the stack, for -memory charmdebug */
void CmiMemoryCheck(void); /* heap check, for -memory paranoid */
void CmiMemoryMark(void); /* ignore current allocations, for -memory leak */
void CmiMemoryMarkBlock(void *blk); /* ignore this allocation, for -memory leak */
void CmiMemorySweep(const char *where); /* print current allocations, for -memory leak */
CMK_TYPEDEF_UINT8 CmiMemoryUsage(void);
const char *CmiMemoryUsageReporter(void);
CMK_TYPEDEF_UINT8 CmiMaxMemoryUsageR(void);
CMK_TYPEDEF_UINT8 CmiMaxMemoryUsage(void);
void CmiResetMaxMemory(void);
CMK_TYPEDEF_UINT8 CmiMinMemoryUsage(void);
void CmiResetMinMemory(void);

/* General functions for malloc'ing aligned buffers */
#define CmiRoundUpToPow2(s, p2)  (s + ((p2 - (s & (p2 - 1))) & (p2 - 1)))
void* CmiMallocAligned(const size_t size, const unsigned int alignment);
void CmiFreeAligned(void* ptr);

#define CMI_MEMORY_IS_ISOMALLOC   (1<<1)
#define CMI_MEMORY_IS_PARANOID    (1<<2)
#define CMI_MEMORY_IS_GNU         (1<<3)
#define CMI_MEMORY_IS_GNUOLD      (1<<4)
#define CMI_MEMORY_IS_OS          (1<<5)
#define CMI_MEMORY_IS_CHARMDEBUG  (1<<6)
int CmiMemoryIs(int flag); /* return state of this flag */

#define CMI_THREAD_IS_QT         (1<<1)
#define CMI_THREAD_IS_CONTEXT    (1<<2)
#define CMI_THREAD_IS_UJCONTEXT  (1<<3)
#define CMI_THREAD_IS_PTHREADS   (1<<4)
#define CMI_THREAD_IS_FIBERS     (1<<5)
#define CMI_THREAD_IS_ALIAS      (1<<6)
#define CMI_THREAD_IS_STACKCOPY  (1<<7)
#define CMI_THREAD_IS_TLS        (1<<8)
int CmiThreadIs(int flag); /* return state of this flag */

void CmiMkdir(const char *dirName);
int CmiGetPageSize(void);

double   CmiCpuTimer(void);

#if CMK_TIMER_USE_RDTSC 
#ifndef __x86_64__
# if !CMK_GCC_X86_ASM
/* Can't use rdtsc unless we have x86 assembly: */
#  undef CMK_TIMER_USE_RDTSC
#  undef CMK_TIMER_USE_GETRUSAGE
#  define CMK_TIMER_USE_RDTSC 0
#  define CMK_TIMER_USE_GETRUSAGE 1
# endif
#endif
#endif

#if CMK_TIMER_USE_RDTSC 
extern double _cpu_speed_factor;

/* Read the CPU's time-stamp counter (cycle count since reset).
   Converted to seconds by CmiWallTimer() via _cpu_speed_factor. */
static __inline__ unsigned long long int rdtsc(void)
{
        unsigned long long int x;
#ifdef __x86_64__
	/* taken from papi code ("perfctr-p3.c") for machines like opteron */
        /* rdtsc leaves the low 32 bits in eax and the high 32 in edx;
           combine them (unsigned long is 64-bit on __x86_64__). */
        do {
          unsigned int a,d;
          asm volatile("rdtsc" : "=a" (a), "=d" (d));
          (x) = ((unsigned long)a) | (((unsigned long)d)<<32);
        } while(0);
#elif CMK_GCC_X86_ASM
        /* 0x0f 0x31 is the rdtsc opcode; "=A" places edx:eax into x
           on 32-bit x86. */
        __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
#else
#  error "Unknown assembly format-- can't use CMK_TIMER_USE_RDTSC."
#endif
        return x;
}

#define CmiWallTimer() ((double)rdtsc()*(_cpu_speed_factor))
#define CmiTimer CmiCpuTimer
double   CmiStartTimer(void);
double   CmiInitTime(void);
#define CmiTimerIsSynchronized()	(0)
#define CmiTimerAbsolute()              (0)

#else
void     CmiTimerInit(char **argv);
int      CmiTimerAbsolute(void);
double   CmiStartTimer(void);
double   CmiInitTime(void);
double   CmiTimer(void);
double   CmiWallTimer(void);
int      CmiTimerIsSynchronized(void);
#endif

char *CmiPrintDate(void); /* human-readable current date/time string */

#if CMK_NODE_QUEUE_AVAILABLE

/* Node-level scheduler queue: shared by all PEs of a node, so every
   access is serialized with the CsdNodeQueueLock node lock. */
#define CsdNodeEnqueueGeneral(x,s,i,p) do { \
          CmiLock(CsvAccess(CsdNodeQueueLock));\
          CqsEnqueueGeneral((Queue)CsvAccess(CsdNodeQueue),(x),(s),(i),(p)); \
          CmiUnlock(CsvAccess(CsdNodeQueueLock)); \
        } while(0)
#define CsdNodeEnqueueFifo(x)     do { \
          CmiLock(CsvAccess(CsdNodeQueueLock));\
          CqsEnqueueFifo((Queue)CsvAccess(CsdNodeQueue),(x)); \
          CmiUnlock(CsvAccess(CsdNodeQueueLock)); \
        } while(0)
/* LIFO enqueue on the node queue, under the node-queue lock.
   (Fixed: the original had an unbalanced extra ')' after (x), which made
   every expansion of this macro a syntax error.) */
#define CsdNodeEnqueueLifo(x)     do { \
          CmiLock(CsvAccess(CsdNodeQueueLock));\
          CqsEnqueueLifo((Queue)CsvAccess(CsdNodeQueue),(x)); \
          CmiUnlock(CsvAccess(CsdNodeQueueLock)); \
        } while(0)
/* Default node enqueue is FIFO, under the node-queue lock. */
#define CsdNodeEnqueue(x)     do { \
          CmiLock(CsvAccess(CsdNodeQueueLock));\
          CqsEnqueueFifo((Queue)CsvAccess(CsdNodeQueue),(x));\
          CmiUnlock(CsvAccess(CsdNodeQueueLock)); \
        } while(0)

/* Queries on the node queue.  (Fixed: the original expansion contained the
   garbled token "C(Queue)pvAccess", a compile error when used; the node
   queue is node-shared, so use CsvAccess consistently with the enqueue
   macros above.) */
#define CsdNodeEmpty()            (CqsEmpty((Queue)CsvAccess(CsdNodeQueue)))
#define CsdNodeLength()           (CqsLength((Queue)CsvAccess(CsdNodeQueue)))

#else

/* Without a node queue, node-level operations fall through to the
   per-PE scheduler queue. */
#define CsdNodeEnqueueGeneral(x,s,i,p) (CsdEnqueueGeneral(x,s,i,p))
#define CsdNodeEnqueueFifo(x) (CqsEnqueueFifo((Queue)CpvAccess(CsdSchedQueue),(x)))
#define CsdNodeEnqueueLifo(x) (CqsEnqueueLifo((Queue)CpvAccess(CsdSchedQueue),(x)))
#define CsdNodeEnqueue(x)     (CsdEnqueue(x))
#define CsdNodeEmpty()        (CqsEmpty((Queue)CpvAccess(CsdSchedQueue)))
#define CsdNodeLength()       (CqsLength((Queue)CpvAccess(CsdSchedQueue)))

#endif

/* Per-PE scheduler queue operations; CsdSchedQueue is processor-private,
   so no locking is needed here. */
#define CsdEnqueueGeneral(x,s,i,p)\
    (CqsEnqueueGeneral((Queue)CpvAccess(CsdSchedQueue),(x),(s),(i),(p)))
#define CsdEnqueueFifo(x)     (CqsEnqueueFifo((Queue)CpvAccess(CsdSchedQueue),(x)))
#define CsdEnqueueLifo(x)     (CqsEnqueueLifo((Queue)CpvAccess(CsdSchedQueue),(x)))
#define CsdEnqueue(x)         (CqsEnqueueFifo((Queue)CpvAccess(CsdSchedQueue),(x)))
#define CsdEmpty()            (CqsEmpty((Queue)CpvAccess(CsdSchedQueue)))
#define CsdLength()           (CqsLength((Queue)CpvAccess(CsdSchedQueue)))
#if CMK_CMIPRINTF_IS_A_BUILTIN /* these are implemented in machine.C */
#if defined __GNUC__ || defined __clang__
__attribute__ ((format (printf, 1, 2)))
#endif
void  CmiPrintf(const char *, ...);
#if defined __GNUC__ || defined __clang__
__attribute__ ((format (printf, 1, 2)))
#endif
void  CmiError(const char *, ...);
#if defined __GNUC__ || defined __clang__
__attribute__ ((format (scanf, 1, 2)))
#endif
int   CmiScanf(const char *, ...);
/* CmiFlush is disabled in this case */
#define CmiFlush(stream) 

#else /* standard definitions */

#include <stdio.h>

/*
 * I made vprintf functions for CmiPrintf and CmiError, but on the
 * O2K, there is no equivalent vscanf!

 #define CmiPrintf printf
 #define CmiError  printf
*/
#include <stdarg.h>

#if defined __GNUC__ || defined __clang__
__attribute__ ((format (printf, 1, 2)))
#endif
void  CmiPrintf(const char *format, ...);
#if defined __GNUC__ || defined __clang__
__attribute__ ((format (printf, 1, 2)))
#endif
void  CmiError(const char *format, ...);
/* CmiFlush works only when CMK_CMIPRINTF_IS_A_BUILTIN is false.
   Note: no trailing semicolon in the expansion -- the caller supplies it,
   exactly as in the builtin branch above; a trailing ';' here would turn
   "if (c) CmiFlush(s); else ..." into a syntax error. */
#define CmiFlush(stream)  fflush(stream)
#define CmiScanf  scanf

#endif

/* __CMK_STRING stringizes its argument; __CMK_XSTRING macro-expands first. */
#if defined(__STDC__) || defined(__cplusplus)
#define __CMK_STRING(x) #x
#else
#define __CMK_STRING(x) "x"
#endif

#define __CMK_XSTRING(x) __CMK_STRING(x)

/* Report a failed assertion message and abort; implemented in the RTS. */
extern void __cmi_assert(const char *);
/* CmiEnforce: always-on check, active even when CMK_ERROR_CHECKING is off. */
#define CmiEnforce(expr) \
  ((void) ((expr) ? 0 :                   \
     (__cmi_assert ("Assertion \"" __CMK_STRING(expr) \
                    "\" failed in file " __FILE__ \
                    " line " __CMK_XSTRING(__LINE__) "."), 0)))

/* CmiAssert: same check, but compiled out in non-error-checking builds. */
#if ! CMK_ERROR_CHECKING
#define CmiAssert(expr) ((void) 0)
#else
#define CmiAssert(expr) \
  ((void) ((expr) ? 0 :                   \
     (__cmi_assert ("Assertion \"" __CMK_STRING(expr) \
                    "\" failed in file " __FILE__ \
                    " line " __CMK_XSTRING(__LINE__) "."), 0)))
#endif

/* Entry point type passed to ConverseInit. */
typedef void (*CmiStartFn)(int argc, char **argv);

/********* CSD - THE SCHEDULER ********
  @addtogroup ConverseScheduler
  @{
*/
/* Countdown until the periodic condition callbacks are polled again. */
CpvExtern(int, _ccd_numchecks);
extern void  CcdCallBacks(void);
/* Called from scheduler loops: run CcdCallBacks() when the countdown expires. */
#define CsdPeriodic() do{ if (CpvAccess(_ccd_numchecks)-- <= 0) CcdCallBacks(); } while(0)
/* NOTE(review): trailing ';' in this expansion means a caller's own ';'
   creates an empty statement -- hazardous inside if/else. */
#define CsdResetPeriodic()    CpvAccess(_ccd_numchecks) = 0;

extern void  CsdEndIdle(void);
extern void  CsdStillIdle(void);
extern void  CsdBeginIdle(void);

/* Per-scheduler-instance snapshot of the queues polled for messages. */
typedef struct {
  void *localQ;
  Queue nodeQ;
  Queue schedQ;
  int *localCounter;
#if CMK_OBJECT_QUEUE_AVAILABLE
  Queue objQ;
#endif
  CmiNodeLock nodeLock;
#if CMK_GRID_QUEUE_AVAILABLE
  Queue gridQ;
#endif
#if CMK_SMP && CMK_TASKQUEUE
  Queue taskQ;
  void *suspendedTaskQ;
#endif
} CsdSchedulerState_t;
extern void CsdSchedulerState_new(CsdSchedulerState_t *state);
extern void *CsdNextMessage(CsdSchedulerState_t *state);
extern void *CsdNextLocalNodeMessage(CsdSchedulerState_t *state);

extern void  *CmiGetNonLocal(void);
extern void   CmiNotifyIdle(void);

/*Different kinds of schedulers: generic, eternal, counting, polling*/
extern  int CsdScheduler(int maxmsgs);
extern void CsdScheduleForever(void);
extern  int CsdScheduleCount(int maxmsgs);
extern void CsdSchedulePoll(void);
extern void CsdScheduleNodePoll(void);

/* Raising the stop flag makes the innermost scheduler loop return. */
#define CsdExitScheduler()  (CpvAccess(CsdStopFlag)++)
/** @} */
/** @} */

#if CMK_SPANTREE_USE_COMMON_CODE

/* Common spanning-tree code: a CST_W-ary tree over nodes, plus a second
   CST_W-ary tree over the PEs within each node (rank 0 is a node's root). */
#define CST_W  (CMK_SPANTREE_MAXSPAN)
#define CST_NN (CmiNumNodes())
/* Parent of node n in the node tree; -1 for the root (node 0). */
#define CmiNodeSpanTreeParent(n) ((n)?(((n)-1)/CST_W):(-1))
#define CmiNodeSpanTreeChildren(n,c) do {\
          int _i; \
          for(_i=0; _i<CST_W; _i++) { \
            int _x = (n)*CST_W+_i+1; \
            if(_x<CST_NN) (c)[_i]=_x; \
          }\
        } while(0)
#define CmiNumNodeSpanTreeChildren(n) ((((n)+1)*CST_W<CST_NN)? CST_W : \
          ((((n)*CST_W+1)>=CST_NN)?0:((CST_NN-1)-(n)*CST_W)))
/* Shorthands: rank of PE, first PE of node, node parent, node of PE,
   size of PE's node. */
#define CST_R(p) (CmiRankOf(p))
#define CST_NF(n) (CmiNodeFirst(n))
#define CST_SP(n) (CmiNodeSpanTreeParent(n))
#define CST_ND(p) (CmiNodeOf(p))
#define CST_NS(p) (CmiNodeSize(CST_ND(p)))
/* Parent PE: within-node parent for nonzero ranks, else the first PE of
   the parent node; -1 for PE 0. */
#define CmiSpanTreeParent(p) ((p)?(CST_R(p)?(CST_NF(CST_ND(p))+(CST_R(p)-1)/CST_W):CST_NF(CST_SP(CST_ND(p)))):(-1))
#define CST_C(p) (((CST_R(p)+1)*CST_W<CST_NS(p))?CST_W:(((CST_R(p)*CST_W+1)>=CST_NS(p))?0:((CST_NS(p)-1)-CST_R(p)*CST_W)))
#define CST_SC(p) (CmiNumNodeSpanTreeChildren(CST_ND(p)))
/* A node-root PE also owns the node-level children. */
#define CmiNumSpanTreeChildren(p) (CST_R(p)?CST_C(p):(CST_SC(p)+CST_C(p)))
#define CmiSpanTreeChildren(p,c) do {\
          int _i,_c=0; \
          if(CST_R(p)==0) { \
            for(_i=0;_i<CST_W;_i++) { \
              int _x = CST_ND(p)*CST_W+_i+1; \
              if(_x<CST_NN) (c)[_c++]=CST_NF(_x); \
            }\
          } \
          for(_i=0;_i<CST_W;_i++) { \
            int _x = CST_R(p)*CST_W+_i+1; \
            if(_x<CST_NS(p)) (c)[_c++]=CST_NF(CST_ND(p))+_x; \
          }\
        } while(0)

#else

/* Machine-layer-provided spanning-tree implementation. */
int      CmiNumSpanTreeChildren(int) ;
int      CmiSpanTreeParent(int) ;
void     CmiSpanTreeChildren(int node, int *children);
int      CmiNumNodeSpanTreeChildren(int);
int      CmiNodeSpanTreeParent(int) ;
void     CmiNodeSpanTreeChildren(int node, int *children) ;
#endif

/****** MULTICAST GROUPS ******/

/* A CmiGroup names a previously established set of PEs for multicast. */
typedef CMK_MULTICAST_GROUP_TYPE CmiGroup;

void     CmiGroupInit(void);
CmiGroup CmiEstablishGroup(int npes, int *pes);
/* Retrieve the PE list of grp; *pes points at RTS-owned storage. */
void     CmiLookupGroup(CmiGroup grp, int *npes, int **pes);

/****** CMI MESSAGE TRANSMISSION ******/

/* Utility function: it packs a multiple chunk message into a singly one.
 * Receives the two return values outsize and outdata to return the size of the
 * composed message and the message itself. It receives the number of chunks to
 * compact, their sizes and datas. The types of the parameters are:
 *
 * int outsize, inndata, *insizes;
 * char *outdata;
 * char **indatas;
 *
 * If inndata is negative, it means that the messages need to be copied together
 * with their memory header at the beginning (message nesting), and padded to 8
 * bytes. The first message never has its memory header attached (it uses the
 * one of the new message).
 */

/* Pack inndata chunks (indatas[i], insizes[i] bytes each) into one buffer.
 * outsize/outdata receive the total size and the CmiAlloc'd result.
 * Negative inndata requests message nesting: each chunk after the first is
 * copied together with its chunkHeaderSize-byte memory header and each chunk
 * starts 8-byte aligned (see the comment block above).
 * Fixed: the nesting loop must start at i=1 -- chunk 0 is already copied
 * just before it; starting at 0 duplicated chunk 0 and overran outdata. */
#define VECTOR_COMPACT(outsize,outdata,inndata,insizes,indatas,chunkHeaderSize) {\
  int i;\
  char *tmp;\
  outsize=0;\
  if (inndata>=0) for(i=0; i<inndata; ++i) outsize += insizes[i];\
  else {\
    for(i=0; i<-inndata; ++i) outsize += ALIGN_DEFAULT(insizes[i]);\
    outsize -= (inndata+1) * chunkHeaderSize;\
  }\
  outdata = (char *)CmiAlloc(outsize);\
  if (!outdata) fprintf(stderr, "%d: Out of mem\n", CmiMyNode());\
  tmp = outdata;\
  if (inndata>=0) {\
    for (i=0; i<inndata; ++i) {\
      memcpy(tmp, indatas[i], insizes[i]);\
      tmp += insizes[i];\
    }\
  } else {\
    /* First chunk keeps no private header (it reuses the new message's). */\
    memcpy(tmp, indatas[0], insizes[0]);\
    tmp += ALIGN_DEFAULT(insizes[0]);\
    for (i=1; i<-inndata; ++i) {\
      memcpy(tmp, indatas[i]-chunkHeaderSize, insizes[i]+chunkHeaderSize);\
      tmp += ALIGN_DEFAULT(insizes[i])+chunkHeaderSize;\
    }\
  }\
}

/* Enqueue a message directly on a local PE's scheduler queue. */
void CmiPushPE(int, void*);
#if CMK_OMP
void          CmiSuspendedTaskEnqueue(int targetRank, void *msg);
void      *   CmiSuspendedTaskPop(void);
#endif
/* Point-to-point sends: (destPE, size, message).  The "Free" variants
   take ownership of the message buffer (see the *AndFree macros below);
   the "Async" variants return a CmiCommHandle to poll for completion. */
void          CmiSyncSendFn(int, int, char *);
CmiCommHandle CmiAsyncSendFn(int, int, char *);
void          CmiFreeSendFn(int, int, char *);

/* Broadcasts: (size, message).  The "All" variants are the senders-included
   form used by the CmiSyncBroadcastAll* macros below. */
void          CmiSyncBroadcastFn(int, char *);
CmiCommHandle CmiAsyncBroadcastFn(int, char *);
void          CmiFreeBroadcastFn(int, char *);

void          CmiSyncBroadcastAllFn(int, char *);
CmiCommHandle CmiAsyncBroadcastAllFn(int, char *);
void          CmiFreeBroadcastAllFn(int, char *);

void          CmiWithinNodeBroadcastFn(int, char*);

/* List sends: (npes, pes[], size, message). */
void          CmiSyncListSendFn(int, const int *, int, char*);
CmiCommHandle CmiAsyncListSendFn(int, const int *, int, char*);
void          CmiFreeListSendFn(int, const int *, int, char*);
void          CmiFreeNodeListSendFn(int, const int *, int, char*);

/* Multicast to an established CmiGroup: (group, size, message). */
void          CmiSyncMulticastFn(CmiGroup, int, char*);
CmiCommHandle CmiAsyncMulticastFn(CmiGroup, int, char*);
void          CmiFreeMulticastFn(CmiGroup, int, char*);

/* inter partition send counterparts */
void          CmiInterSyncSendFn(int, int, int, char *);
void          CmiInterFreeSendFn(int, int, int, char *);

/* Reduction callback types:
   - merge:  combine the local contribution with remote ones;
   - pup:    pack/unpack struct contributions for the wire;
   - delete: free a struct contribution once merged. */
typedef void * (*CmiReduceMergeFn)(int*,void*,void**,int);
typedef void (*CmiReducePupFn)(void*,void*);
typedef void (*CmiReduceDeleteFn)(void*);

/* Per-reduction bookkeeping kept by the RTS while contributions arrive. */
typedef struct {
  void *localData;
  char **remoteData;
  int localSize;
  short int numRemoteReceived;
  short int numChildren;
  int parent;
  CmiUInt2 seqID;
  char localContributed;
  struct {
    CmiHandler destination;
    CmiReduceMergeFn mergeFn;
    CmiReducePupFn pupFn;
    CmiReduceDeleteFn deleteFn;
  } ops;
} CmiReduction;

typedef CmiUInt2 CmiReductionID;

void * CmiReduceMergeFn_random(int*, void*, void**, int);

/* Reductions over all PEs; the *ID variants use an explicitly obtained
   CmiReductionID instead of the implicit global sequence. */
void CmiReduce(void *msg, int size, CmiReduceMergeFn mergeFn);
void CmiReduceStruct(void *data, CmiReducePupFn pupFn,
                     CmiReduceMergeFn mergeFn, CmiHandler dest,
                     CmiReduceDeleteFn deleteFn);
void CmiReduceID(void *msg, int size, CmiReduceMergeFn mergeFn, CmiReductionID id);
void CmiReduceStructID(void *data, CmiReducePupFn pupFn,
                     CmiReduceMergeFn mergeFn, CmiHandler dest,
                     CmiReduceDeleteFn deleteFn, CmiReductionID id);
/* Reductions over an explicit PE list or an established group. */
void CmiListReduce(int npes, int *pes, void *msg, int size, CmiReduceMergeFn mergeFn, CmiReductionID id);
void CmiListReduceStruct(int npes, int *pes,
                     void *data, CmiReducePupFn pupFn,
                     CmiReduceMergeFn mergeFn, CmiHandler dest,
                     CmiReduceDeleteFn deleteFn, CmiReductionID id);
void CmiGroupReduce(CmiGroup grp, void *msg, int size, CmiReduceMergeFn mergeFn, CmiReductionID id);
void CmiGroupReduceStruct(CmiGroup grp, void *data, CmiReducePupFn pupFn,
                     CmiReduceMergeFn mergeFn, CmiHandler dest,
                     CmiReduceDeleteFn deleteFn, CmiReductionID id);
/* Node-level reductions (one contribution per node). */
void CmiNodeReduce(void *msg, int size, CmiReduceMergeFn mergeFn);
void CmiNodeReduceStruct(void *data, CmiReducePupFn pupFn,
                         CmiReduceMergeFn mergeFn, CmiHandler dest,
                         CmiReduceDeleteFn deleteFn);
void CmiNodeReduceID(void *msg, int size, CmiReduceMergeFn mergeFn, CmiReductionID id);
void CmiNodeReduceStructID(void *data, CmiReducePupFn pupFn,
                           CmiReduceMergeFn mergeFn, CmiHandler dest,
                           CmiReduceDeleteFn deleteFn, CmiReductionID id);
int CmiGetReductionHandler(void);
CmiHandler CmiGetReductionDestination(void);
/* Obtain reduction IDs: global sequence or dynamically allocated. */
CmiReductionID CmiGetGlobalReduction(void);
CmiReductionID CmiGetGlobalNodeReduction(void);
CmiReductionID CmiGetDynamicReduction(void);
CmiReductionID CmiGetDynamicNodeReduction(void);
void CmiGetDynamicReductionRemote(int handlerIdx, int pe, int dataSize, void *data);
void CmiGetDynamicNodeReductionRemote(int handlerIdx, int node, int dataSize, void *data);

void CmiResetGlobalReduceSeqID(void);
void CmiResetGlobalNodeReduceSeqID(void);

/* If the second parameter (the number of chunks to send) is negative, then
 * every message will be started aligned with 8 bytes, and a message header will
 * be preponed to every message (message nesting), except the first one which
 * uses that of the entire message.
 */
void          CmiSyncVectorSend(int, int, int *, char **);
CmiCommHandle CmiAsyncVectorSend(int, int, int *, char **);
void          CmiSyncVectorSendAndFree(int, int, int *, char **);

void	      CmiMultipleSend(unsigned int, int, int *, char **);
void	      CmiMultipleIsend(unsigned int, int, int *, char **);

/* Poll/free a communication handle returned by the CmiAsync* calls. */
int           CmiAsyncMsgSent(CmiCommHandle);
void          CmiReleaseCommHandle(CmiCommHandle);

/* Convenience wrappers over the *Fn entry points; the "AndFree" forms
   hand ownership of the message buffer to the RTS. */
#define CmiSyncSend(p,s,m)              (CmiSyncSendFn((p),(s),(char *)(m)))
#define CmiAsyncSend(p,s,m)             (CmiAsyncSendFn((p),(s),(char *)(m)))
#define CmiSyncSendAndFree(p,s,m)       (CmiFreeSendFn((p),(s),(char *)(m)))

#define CmiSyncBroadcast(s,m)           (CmiSyncBroadcastFn((s),(char *)(m)))
#define CmiAsyncBroadcast(s,m)          (CmiAsyncBroadcastFn((s),(char *)(m)))
#define CmiSyncBroadcastAndFree(s,m)    (CmiFreeBroadcastFn((s),(char *)(m)))

#define CmiSyncBroadcastAll(s,m)        (CmiSyncBroadcastAllFn((s),(char *)(m)))
#define CmiAsyncBroadcastAll(s,m)       (CmiAsyncBroadcastAllFn((s),(char *)(m)))
#define CmiSyncBroadcastAllAndFree(s,m) (CmiFreeBroadcastAllFn((s),(char *)(m)))

#define CmiSyncListSend(n,l,s,m)        (CmiSyncListSendFn((n),(l),(s),(char *)(m)))
#define CmiAsyncListSend(n,l,s,m)       (CmiAsyncListSendFn((n),(l),(s),(char *)(m)))
#define CmiSyncListSendAndFree(n,l,s,m) (CmiFreeListSendFn((n),(l),(s),(char *)(m)))

#define CmiSyncMulticast(g,s,m)         (CmiSyncMulticastFn((g),(s),(char*)(m)))
#define CmiAsyncMulticast(g,s,m)        (CmiAsyncMulticastFn((g),(s),(char*)(m)))
#define CmiSyncMulticastAndFree(g,s,m)  (CmiFreeMulticastFn((g),(s),(char*)(m)))


/* adding functions for inter-partition communication - only the sync ones because */
/* we do not use the async ones */
#if CMK_HAS_PARTITION
#define CmiInterSyncSend(pe,p,s,m)              (CmiInterSyncSendFn((pe),(p),(s),(char *)(m)))
#define CmiInterSyncSendAndFree(pe,p,s,m)       (CmiInterFreeSendFn((pe),(p),(s),(char *)(m)))
#else
/* Single partition: the partition argument is ignored. */
#define CmiInterSyncSend(pe,p,s,m)              (CmiSyncSendFn((pe),(s),(char *)(m)))
#define CmiInterSyncSendAndFree(pe,p,s,m)       (CmiFreeSendFn((pe),(s),(char *)(m)))
#endif

/* support for rest may come later if required */

#if CMK_NODE_QUEUE_AVAILABLE
/* Node-level sends: (node, size, message), delivered to the node queue. */
void          CmiSyncNodeSendFn(int, int, char *);
CmiCommHandle CmiAsyncNodeSendFn(int, int, char *);
void          CmiFreeNodeSendFn(int, int, char *);

void          CmiSyncNodeBroadcastFn(int, char *);
CmiCommHandle CmiAsyncNodeBroadcastFn(int, char *);
void          CmiFreeNodeBroadcastFn(int, char *);

void          CmiSyncNodeBroadcastAllFn(int, char *);
CmiCommHandle CmiAsyncNodeBroadcastAllFn(int, char *);
void          CmiFreeNodeBroadcastAllFn(int, char *);

/* if node queue is available, adding inter partition counterparts */
void          CmiInterSyncNodeSendFn(int, int, int, char *);
void          CmiInterFreeNodeSendFn(int, int, int, char *);
#endif

#if CMK_NODE_QUEUE_AVAILABLE
/* Node-queue available: wrappers over the node-level *Fn entry points. */
#define CmiSyncNodeSend(p,s,m)          (CmiSyncNodeSendFn((p),(s),(char *)(m)))
#define CmiAsyncNodeSend(p,s,m)             (CmiAsyncNodeSendFn((p),(s),(char *)(m)))
#define CmiSyncNodeSendAndFree(p,s,m)       (CmiFreeNodeSendFn((p),(s),(char *)(m)))
#define CmiSyncNodeBroadcast(s,m)           (CmiSyncNodeBroadcastFn((s),(char *)(m)))
#define CmiAsyncNodeBroadcast(s,m)          (CmiAsyncNodeBroadcastFn((s),(char *)(m)))
#define CmiSyncNodeBroadcastAndFree(s,m)    (CmiFreeNodeBroadcastFn((s),(char *)(m)))
#define CmiSyncNodeBroadcastAll(s,m)        (CmiSyncNodeBroadcastAllFn((s),(char *)(m)))
#define CmiAsyncNodeBroadcastAll(s,m)       (CmiAsyncNodeBroadcastAllFn((s),(char *)(m)))
#define CmiSyncNodeBroadcastAllAndFree(s,m) (CmiFreeNodeBroadcastAllFn((s),(char *)(m)))
#define CmiWithinNodeBroadcast(s,m)         (CmiWithinNodeBroadcastFn((s),(char *)(m)))

/* counterparts of inter partition */
#if CMK_HAS_PARTITION
#define CmiInterSyncNodeSend(pe,p,s,m)         (CmiInterSyncNodeSendFn((pe),(p),(s),(char *)(m)))
#define CmiInterSyncNodeSendAndFree(pe,p,s,m)  (CmiInterFreeNodeSendFn((pe),(p),(s),(char *)(m)))
#else 
#define CmiInterSyncNodeSend(pe,p,s,m)         (CmiSyncNodeSendFn((pe),(s),(char *)(m)))
#define CmiInterSyncNodeSendAndFree(pe,p,s,m)  (CmiFreeNodeSendFn((pe),(s),(char *)(m)))
#endif

#else

/* No node queue: address a node through its first PE instead. */
#define CmiSyncNodeSend(n,s,m)        CmiSyncSend(CmiNodeFirst(n),s,m)
#define CmiAsyncNodeSend(n,s,m)       CmiAsyncSend(CmiNodeFirst(n),s,m)
#define CmiSyncNodeSendAndFree(n,s,m) CmiSyncSendAndFree(CmiNodeFirst(n),s,m)
#if CMK_MULTICORE
/* Multicore: emulate a node broadcast by looping over all nodes' first PEs. */
#define CmiSyncNodeBroadcast(s,m)           do { \
          int _i; \
          for(_i=0; _i<CmiNumNodes(); _i++) \
            if(_i != CmiMyNode()) \
              CmiSyncSend(CmiNodeFirst(_i),s,m); \
        } while(0)
#define CmiAsyncNodeBroadcast(s,m)          CmiSyncNodeBroadcast(s,m)
#define CmiSyncNodeBroadcastAndFree(s,m)    do { \
          CmiSyncNodeBroadcast(s,m); \
          CmiFree(m); \
        } while(0)
#define CmiSyncNodeBroadcastAll(s,m)           do { \
          int _i; \
          for(_i=0; _i<CmiNumNodes(); _i++) \
            CmiSyncSend(CmiNodeFirst(_i),s,m); \
        } while(0)
#define CmiAsyncNodeBroadcastAll(s,m)       CmiSyncNodeBroadcastAll(s,m)
#define CmiSyncNodeBroadcastAllAndFree(s,m) do { \
          CmiSyncNodeBroadcastAll(s,m); \
          CmiFree(m); \
        } while(0)
#define CmiWithinNodeBroadcast(s,m)         (CmiWithinNodeBroadcastFn((s),(char *)(m)))
#else
/* One PE per node: node broadcasts degenerate to PE broadcasts. */
#define CmiSyncNodeBroadcast(s,m)           CmiSyncBroadcast(s,m)
#define CmiAsyncNodeBroadcast(s,m)          CmiAsyncBroadcast(s,m)
#define CmiSyncNodeBroadcastAndFree(s,m)    CmiSyncBroadcastAndFree(s,m)
#define CmiSyncNodeBroadcastAll(s,m)        CmiSyncBroadcastAll(s,m)
#define CmiAsyncNodeBroadcastAll(s,m)       CmiAsyncBroadcastAll(s,m)
#define CmiSyncNodeBroadcastAllAndFree(s,m) CmiSyncBroadcastAllAndFree(s,m)
#define CmiWithinNodeBroadcast(s,m)         CmiSyncSendAndFree(CmiMyPe(),s,m)
#endif
/* and the inter partition counterparts */
#if CMK_HAS_PARTITION
#define CmiInterSyncNodeSend(n,p,s,m)          CmiInterSyncSend(CmiNodeFirst(n),p,s,m)
#define CmiInterSyncNodeSendAndFree(n,p,s,m)   CmiInterSyncSendAndFree(CmiNodeFirst(n),p,s,m)
#else
#define CmiInterSyncNodeSend(n,p,s,m)          CmiSyncSend(CmiNodeFirst(n),s,m)
#define CmiInterSyncNodeSendAndFree(n,p,s,m)   CmiSyncSendAndFree(CmiNodeFirst(n),s,m)
#endif
#endif

/******** CMI MESSAGE RECEPTION ********/

void   CmiDeliversInit(void);
/* Deliver up to maxmsgs queued messages; see also CsdScheduler. */
int    CmiDeliverMsgs(int maxmsgs);
/* Deliver only messages bound for a specific registered handler. */
void   CmiDeliverSpecificMsg(int handler);
/* Invoke the registered handler for one message. */
void   CmiHandleMessage(void *msg);

/******** CQS: THE QUEUEING SYSTEM ********
 * @addtogroup CharmScheduler
 * @{ 
 */
/* Queueing strategies: plain FIFO/LIFO, and integer- (I), bitvector- (B),
   or long- (L) prioritized variants. */
#define CQS_QUEUEING_FIFO 2
#define CQS_QUEUEING_LIFO 3
#define CQS_QUEUEING_IFIFO 4
#define CQS_QUEUEING_ILIFO 5
#define CQS_QUEUEING_BFIFO 6
#define CQS_QUEUEING_BLIFO 7
#define CQS_QUEUEING_LFIFO 8
#define CQS_QUEUEING_LLIFO 9
/** @} */


/****** CTH: THE LOW-LEVEL THREADS PACKAGE ******/

typedef struct CthThreadStruct *CthThread;
typedef struct {
  /*Start with a message header so threads can be enqueued 
    as messages (e.g., by CthEnqueueNormalThread in convcore.C)
  */
  char cmicore[CmiReservedHeaderSize];
  CthThread thread;
  /* NOTE(review): appears to guard against awakening a stale token after
     thread reuse -- confirm against threads.C. */
  int serialNo;
} CthThreadToken;

CthThreadToken *CthGetToken(CthThread);

/* Callback types used by thread scheduling strategies. */
typedef void        (*CthVoidFn)(void *);
typedef void        (*CthAwkFn)(CthThreadToken *,int,
				int prioBits,unsigned int *prioptr);
typedef CthThread   (*CthThFn)(void);

void       CthSetSerialNo(CthThread t, int no);
int        CthImplemented(void);

CthThread  CthSelf(void);
/* CthCreate(fn, arg, size): new suspended thread running fn(arg);
   size is presumably the stack size -- confirm in threads.C. */
CthThread  CthCreate(CthVoidFn, void *, int);
void       CthResume(CthThread);
void       CthFree(CthThread);

void       CthSetSuspendable(CthThread, int);
int        CthIsSuspendable(CthThread);

// For debugging
void	   CthPrintThdMagic(CthThread); 
void 	   CthPrintThdStack(CthThread);

/* Suspend the running thread / awaken a suspended one (optionally with
   queueing priority); strategies customize how awakened threads are
   rescheduled. */
void       CthSuspend(void);
void       CthAwaken(CthThread);
void       CthAwakenPrio(CthThread, int, int, unsigned int *);
void       CthSetStrategy(CthThread, CthAwkFn, CthThFn);
void       CthSetStrategyDefault(CthThread);
#if CMK_OMP
void       CthSetStrategyWorkStealing(CthThread);
void       CthSetStrategySuspendedWorkStealing(CthThread);
int        CthScheduled(CthThread t);
void       CthScheduledDecrement(void);
CthThread  CthGetCurrentThread(void);
CpvExtern(int, prevGtid);
void       CthSetPrev(CthThread t, CthThread prev);
#endif
void       CthYield(void);
void       CthYieldPrio(int,int,unsigned int*);

void       CthSetNext(CthThread t, CthThread next);
CthThread  CthGetNext(CthThread t);
#if CMK_TRACE_ENABLED
void CthSetEventInfo(CthThread t, int event, int srcPE);
#endif
void       CthSwitchThread(CthThread t);

/* Translate between a stack address and its offset within thread t's stack. */
size_t     CthStackOffset(CthThread t, char *p);
char     * CthPointer(CthThread t, size_t pos);

/* unimplemented: preemptive threads */
void       CthAutoYield(CthThread t, int flag);
double     CthAutoYieldFreq(CthThread t);
void       CthAutoYieldBlock(void);
void       CthAutoYieldUnblock(void);

/* Converse Thread Global (Ctg) global variable manipulation */
typedef struct CtgGlobalStruct {
  /* Pointer to our global data segment. */
  void * data_seg;
} CtgGlobals;

/** Initialize the globals support (called on each processor). */
void CtgInit(void);

/** PIC method used. **/
#define CMI_PIC_NOP     0
#define CMI_PIC_ELFGOT  1
CpvExtern(int, CmiPICMethod);

/** Copy the current globals into this new set */
size_t CtgGetSize(void);
CtgGlobals CtgCreate(void * buf);
/** Install this set of globals. */
void CtgInstall(CtgGlobals g);
void CtgUninstall(void);
/** Return the current global list */
CtgGlobals CtgCurrentGlobals(void);

/* NOTE(review): Push/Pop naming suggests nested deactivation of global
   interception for thread th -- confirm semantics in the implementation. */
void CthInterceptionsDeactivatePush(CthThread th);
void CthInterceptionsDeactivatePop(CthThread th);
int  CthInterceptionsTemporarilyActivateStart(CthThread th);
void CthInterceptionsTemporarilyActivateEnd(CthThread th, int old);

/* The thread listener structure. The user must register one such listener
	if he wants to find out when a thread is suspended or when it starts running
	or gets destroyed. It can be used for tracing etc.
*/

struct CthThreadListener;

typedef void (*CthThreadListener_suspend)(struct CthThreadListener *l);
typedef void (*CthThreadListener_resume)(struct CthThreadListener *l);
typedef void (*CthThreadListener_free)(struct CthThreadListener *l);

struct CthThreadListener {
       /** This thread is about to block. */
       CthThreadListener_suspend suspend;

       /** This thread is about to begin execution after blocking. */
       CthThreadListener_resume resume;

       /** This thread is being destroyed.  */
       CthThreadListener_free free;

       /** Pointer to listener-specific data (if needed).
           Set by listener.
       */
       void *data;

       /** Pointer to the thread this listener controls.
           Set by CthAddListener.
        */
       CthThread thread;

       /** The next listener, or NULL at end of chain.
           Set by CthAddListener, and used only by threads.C.
       */
       struct CthThreadListener *next;
};

/**
  This listener is about to begin receiving suspend and
resume events for this thread.  "suspend", "resume", "free",
and "data" fields must all already be set.  When this thread
exits, l->free will be called on this listener, which should
deallocate the listener memory.
*/
void CthAddListener(CthThread th,struct CthThreadListener *l);

/**
  This function is inserted into .def.h after thread creation.
  It serves as an interface for the user to add the listeners 
  as needed. User has to implement this somewhere in the system.
*/
void CthUserAddListeners(CthThread th);

/****** CTH: THREAD-PRIVATE VARIABLES ******/

#if CMK_THREADS_REQUIRE_NO_CPV

/* No Cpv support: thread-private variables degenerate to plain globals. */
#define CthCpvDeclare(t,v)    t v
#define CthCpvExtern(t,v)     extern t v
#define CthCpvStatic(t,v)     static t v
#define CthCpvInitialize(t,v) do {} while(0)
#define CthCpvAccess(x)       x

#else

#define CthCpvDeclare(t,v)    CpvDeclare(t,v)
#define CthCpvExtern(t,v)     CpvExtern(t,v)
#define CthCpvStatic(t,v)     CpvStaticDeclare(t,v)
#define CthCpvInitialize(t,v) CpvInitialize(t,v)
#define CthCpvAccess(x)       CpvAccess(x)

#endif

/* Base pointer of the current thread's Ctv data block. */
CthCpvExtern(char *,CthData);
extern size_t CthRegister(size_t dataSize);
extern void CthRegistered(size_t dataOffMax);
extern char *CthGetData(CthThread t);

/* A Ctv variable lives at a registered byte offset (CsvAccess(CtvOffs##v))
   inside each thread's CthData block; -1 marks "not yet registered". */
#define CtvDeclare(t,v)         typedef t CtvType##v; CsvDeclare(int,CtvOffs##v)=(-1)
#define CtvStaticDeclare(t,v)   typedef t CtvType##v; CsvStaticDeclare(int,CtvOffs##v)=(-1)
#define CtvExtern(t,v)          typedef t CtvType##v; CsvExtern(int,CtvOffs##v)
#define CtvAccess(v)            (*((CtvType##v *)(CthCpvAccess(CthData)+CsvAccess(CtvOffs##v))))
#define CtvAccessOther(t,v)            (*((CtvType##v *)(CthGetData(t)+CsvAccess(CtvOffs##v))))
#define CtvInitialize(t,v)      do { \
	if(CsvAccess(CtvOffs##v)==(-1)) \
		CsvAccess(CtvOffs##v)=CthRegister(sizeof(CtvType##v));\
	else CthRegistered(CsvAccess(CtvOffs##v)+sizeof(CtvType##v));\
} while(0)

#define CtvInitialized(v) (CsvAccess(CtvOffs##v)!=(-1))

/****** CFUTURE: CONVERSE FUTURES ******/

typedef struct Cfuture_s
{
  int pe; /* PE where the future's data lives */
  struct Cfuture_data_s *data;
}
Cfuture;

/* NOTE(review): this operates on a struct Cfuture_data_s* (which evidently
   has a 'rest' member), not on a Cfuture value -- confirm against the
   futures implementation. */
#define CfutureValueData(v) ((void*)((v)->rest))

/* Create a future; set its value (copying len bytes); block until set. */
Cfuture       CfutureCreate(void);
void          CfutureSet(Cfuture f, void *val, int len);
void         *CfutureWait(Cfuture f);
void          CfutureDestroy(Cfuture f);

/* Zero-copy variant: fill a preallocated buffer and store it directly. */
void         *CfutureCreateBuffer(int bytes);
void          CfutureDestroyBuffer(void *val);
void          CfutureStoreBuffer(Cfuture f, void *value);

#define       CfuturePE(f) ((f).pe)

void CfutureInit(void);

/****** CLD: THE LOAD BALANCER ******/

/* Special destination PEs for CldEnqueue. */
#define CLD_ANYWHERE (-1)
#define CLD_BROADCAST (-2)
#define CLD_BROADCAST_ALL (-3)

/* Prepares a message for transport before it is moved between PEs. */
typedef void (*CldPackFn)(void *msg);

/* Extracts scheduling metadata (packer, length, queueing strategy,
   priority) from a message. */
typedef void (*CldInfoFn)(void *msg, 
                          CldPackFn *packer,
                          int *len,
                          int *queueing,
                          int *priobits, 
                          unsigned int **prioptr);

/* Returns this PE's load estimate. */
typedef int (*CldEstimator)(void);

int CldRegisterInfoFn(CldInfoFn fn);
int CldRegisterPackFn(CldPackFn fn);
void CldRegisterEstimator(CldEstimator fn);
int CldEstimate(void);
const char *CldGetStrategy(void);

void CldEnqueue(int pe, void *msg, int infofn);
void CldEnqueueMulti(int npes, const int *pes, void *msg, int infofn);
void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn);
// CldNodeEnqueue enqueues a single message for a node, whereas
// CldEnqueueWithinNode enqueues a message for each PE on the node.
void CldNodeEnqueue(int node, void *msg, int infofn);
void CldEnqueueWithinNode(void *msg, int infofn);

/****** CMM: THE MESSAGE MANAGER ******/

typedef struct CmmTableStruct *CmmTable;

/* Matches any tag value in CmmFind/CmmGet/CmmProbe. */
#define CmmWildCard (-1)

typedef void (*CmmPupMessageFn)(pup_er p,void **msg);
CmmTable CmmPup(pup_er p, CmmTable t, CmmPupMessageFn msgpup);

CmmTable   CmmNew(void);
void       CmmFree(CmmTable t);
void	   CmmFreeAll(CmmTable t);
void       CmmPut(CmmTable t, int ntags, int *tags, void *msg);
/* Find a message matching tags (CmmWildCard allowed); returns the actual
   tags in returntags and removes the entry when del is nonzero. */
void      *CmmFind(CmmTable t, int ntags, int *tags, int *returntags, int del);
int        CmmEntries(CmmTable t);
int 	   CmmGetLastTag(CmmTable t, int ntags, int *tags);
#define    CmmGet(t,nt,tg,rt)   (CmmFind((t),(nt),(tg),(rt),1))
#define    CmmProbe(t,nt,tg,rt) (CmmFind((t),(nt),(tg),(rt),0))

/******** ConverseInit and ConverseExit ********/

void ConverseInit(int, char**, CmiStartFn, int, int);

/* Optional parameter for ConverseExit() - based on
https://stackoverflow.com/a/28074198/1250282 */

void realConverseExit(int exitcode);

/* Argument-count dispatch: ConverseExit() -> exit code 0,
   ConverseExit(code) -> realConverseExit(code). */
#define CONVEXIT_1(x) realConverseExit(x)
#define CONVEXIT_0() CONVEXIT_1(0) /* Default ConverseExit() exit code: 0 */

#define CONV_FUNC_CHOOSER(_f1, _f2, _f3, ...) _f3
#define CONV_FUNC_RECOMPOSER(argsWithParentheses) CONV_FUNC_CHOOSER argsWithParentheses
#define CONV_CHOOSE_FROM_ARG_COUNT(...) CONV_FUNC_RECOMPOSER((__VA_ARGS__, CONVEXIT_2, CONVEXIT_1, ))
#define CONV_NO_ARG_EXPANDER() ,,CONVEXIT_0
#define CONV_MACRO_CHOOSER(...) CONV_CHOOSE_FROM_ARG_COUNT(CONV_NO_ARG_EXPANDER __VA_ARGS__ ())
#define ConverseExit(...) CONV_MACRO_CHOOSER(__VA_ARGS__)(__VA_ARGS__)


#if CMK_SHRINK_EXPAND
void ConverseCleanup(void);
#endif

/* Print a printf-style message and terminate the parallel job. */
CMK_NORETURN
#if defined __GNUC__ || defined __clang__
__attribute__ ((format (printf, 1, 2)))
#endif
void CmiAbort(const char *msg, ...);

void CmiOutOfMemory(int nBytes);

/* _MEMCHECK(p): abort via CmiOutOfMemory when an allocation returned NULL. */
#if CMK_MEMCHECK_OFF
#define _MEMCHECK(p) do{}while(0)
#else
#define _MEMCHECK(p) do { \
                         if ((p)==0) CmiOutOfMemory(-1);\
                     } while(0)
#endif

/******** CONVCONDS ********/

typedef void (*CcdVoidFn)(void *userParam,double curWallTime);

/*CPU conditions*/
#define CcdPROCESSOR_BEGIN_BUSY 0
#define CcdPROCESSOR_END_IDLE 0 /*Synonym*/
#define CcdPROCESSOR_BEGIN_IDLE 1
#define CcdPROCESSOR_END_BUSY 1 /*Synonym*/
#define CcdPROCESSOR_STILL_IDLE 2
#define CcdPROCESSOR_LONG_IDLE 3

/*Periodic calls*/
#define CcdPERIODIC           4 /*every few ms*/
#define CcdPERIODIC_10ms      5 /*every 10ms (100Hz)*/
#define CcdPERIODIC_100ms     6 /*every 100ms (10Hz)*/
#define CcdPERIODIC_1second   7 /*every second*/
#define CcdPERIODIC_1s        7 /*every second*/
#define CcdPERIODIC_5s        8 /*every 5 seconds*/
#define CcdPERIODIC_5seconds  8 /*every 5 seconds*/
#define CcdPERIODIC_10second  9 /*every 10 seconds*/
#define CcdPERIODIC_10seconds 9 /*every 10 seconds*/
#define CcdPERIODIC_10s       9 /*every 10 seconds*/
#define CcdPERIODIC_1minute  10 /*every minute*/
#define CcdPERIODIC_2minute  11 /*every 2 minutes*/
#define CcdPERIODIC_5minute  12 /*every 5 minutes*/
#define CcdPERIODIC_10minute 13 /*every 10 minutes*/
#define CcdPERIODIC_1hour    14 /*every hour*/
#define CcdPERIODIC_12hour   15 /*every 12 hours*/
#define CcdPERIODIC_1day     16 /*every day*/

/*Other conditions*/
#define CcdSCHEDLOOP         17
#define CcdQUIESCENCE        18
#define CcdTOPOLOGY_AVAIL    19
#define CcdSIGUSR1           20
#define CcdSIGUSR2           21

/*User-defined conditions start here*/
#define CcdUSER              22

/*User-defined conditions end here*/
/*Conditionally defined so users can build with larger CcdUSERMAX values*/
#ifndef CcdUSERMAX
#define CcdUSERMAX          127
#endif

#define CcdIGNOREPE   -2
#if CMK_CONDS_USE_SPECIAL_CODE
typedef int (*CmiSwitchToPEFnPtr)(int pe);
extern CmiSwitchToPEFnPtr CmiSwitchToPE;
#else
#define CmiSwitchToPE(pe)  pe
#endif
/* Register callbacks: "After" runs once after msecs; "OnCondition" runs when
   the condition is raised (once, or repeatedly for the "Keep" variants).
   The OnPE forms target a specific PE.  Registration returns an index usable
   with the Cancel calls. */
void CcdCallFnAfter(CcdVoidFn fnp, void *arg, double msecs);
int CcdCallOnCondition(int condnum, CcdVoidFn fnp, void *arg);
int CcdCallOnConditionKeep(int condnum, CcdVoidFn fnp, void *arg);
void CcdCallFnAfterOnPE(CcdVoidFn fnp, void *arg, double msecs, int pe);
int CcdCallOnConditionOnPE(int condnum, CcdVoidFn fnp, void *arg, int pe);
int CcdCallOnConditionKeepOnPE(int condnum, CcdVoidFn fnp, void *arg, int pe);
void CcdCancelCallOnCondition(int condnum, int idx);
void CcdCancelCallOnConditionKeep(int condnum, int idx);
double CcdRaiseCondition(int condnum);
double CcdSetResolution(double newResolution);
double CcdResetResolution(void);
double CcdIncreaseResolution(double newResolution);

/* Command-Line-Argument handling.
   The CmiGetArg* family scans argv for the named option; the typed
   variants store the parsed value into *optDest and return nonzero if
   the option was present.  The *Desc variants additionally register a
   description string for usage/help output. */
void CmiArgGroup(const char *parentName,const char *groupName);
int CmiGetArgInt(char **argv,const char *arg,int *optDest);
int CmiGetArgIntDesc(char **argv,const char *arg,int *optDest,const char *desc);
int CmiGetArgLong(char **argv,const char *arg,CmiInt8 *optDest);
int CmiGetArgLongDesc(char **argv,const char *arg,CmiInt8 *optDest,const char *desc);
int CmiGetArgDouble(char **argv,const char *arg,double *optDest);
int CmiGetArgDoubleDesc(char **argv,const char *arg,double *optDest,const char *desc);
int CmiGetArgString(char **argv,const char *arg,char **optDest);
int CmiGetArgStringDesc(char **argv,const char *arg,char **optDest,const char *desc);
/* Flag variants: return nonzero if the bare option is present. */
int CmiGetArgFlag(char **argv,const char *arg);
int CmiGetArgFlagDesc(char **argv,const char *arg,const char *desc);
/* Remove k entries from argv (used after consuming options). */
void CmiDeleteArgs(char **argv,int k);
int CmiGetArgc(char **argv);
char **CmiCopyArgs(char **argv);
int CmiArgGivingUsage(void);
void CmiDeprecateArgInt(char **argv,const char *arg,const char *desc,const char *warning);

/** 
   Extract the function-return pointers listed in the stack
   up to this depth.  nSkip is the number of enclosing functions
   to skip-- for example, nSkip==0 means the retPtrs[0]
   will be the caller of CmiBacktraceRecord.  
   Returns retPtrs[0..*nLevels-1] stack pointers.
   *nLevels may be decreased if not enough levels are available.
 */
void CmiBacktraceRecord(void **retPtrs,int nSkip,int *nLevels);

/** Look up the names of these function pointers.
Caller must free() the returned array, but not the individual
strings.
*/
char **CmiBacktraceLookup(void **srcPtrs,int nLevels);

/** Print out the names of these function pointers. */
void CmiBacktracePrint(void **retPtrs,int nLevels);

/* Print (to stdout) the names of the functions that have been 
   called up to this point. nSkip is the number of routines on the
   top of the stack to *not* print out. */
void CmiPrintStackTrace(int nSkip);
int CmiIsFortranLibraryCall(void);

/* Per-processor local message queue (generic-delivery builds only). */
#if CMK_CMIDELIVERS_USE_COMMON_CODE
CpvExtern(void*, CmiLocalQueue);
#endif

/* Copy a len-byte message; presumably returns a newly allocated
   buffer the caller owns -- confirm against the implementation. */
char *CmiCopyMsg(char *msg, int len);

/******** Hypercube broadcast propagation (Binomial tree) ********/

/*
  This routine receives a number k containing the dimension in the hypercube
  to be used for the broadcast, i.e. k=0 means sending only to MyPe^1, k=1 means
  sending to MyPe^2 and MyPe^1, etc.
  The array dest_pes will be filled with the ids of the processors to send to;
  it has to be already allocated, and its size should be at least k+1 to allow
  enough space.
  It returns the number of processors to send to, i.e. the size of dest_pes.
  This may be less than k+1 due to incomplete hypercubes.
  For example with pow(2,n)+2 procs and 0 broadcasting, proc pow(2,n) will
  receive from 0 in the first step but then it has only proc pow(2,n)+1 as
  destination, so most of the other dimensions will be skipped.
*/
int HypercubeGetBcastDestinations(int mype, int total_pes, int k, int *dest_pes);

/******** Immediate Messages ********/

/* Handler index for immediate messages. */
CpvExtern(int, CmiImmediateMsgHandlerIdx);


/* Calls since the last progress-engine invocation; compared against
   networkProgressPeriod by the CmiNetworkProgress macros below. */
CpvExtern(unsigned, networkProgressCount);
extern int networkProgressPeriod;

#if !CMK_MACHINE_PROGRESS_DEFINED

/* No machine-specific progress engine: the progress hooks compile away. */
#define CmiNetworkProgress()
#define CmiNetworkProgressAfter(p)
#define CmiMachineProgressImpl()

#else

void CmiMachineProgressImpl(void);

/* Drive the machine progress engine (and, when built with PXSHM, the
 * shared-memory communication server) once every networkProgressPeriod
 * calls; CmiNetworkProgressAfter(p) uses the caller-supplied threshold p
 * instead.  Each macro is wrapped in do { } while (0) so it expands to a
 * single statement and remains safe inside unbraced if/else bodies. */

#if CMK_USE_PXSHM
void CommunicationServerPxshm(void);
#define CmiNetworkProgress() do { \
      CpvAccess(networkProgressCount)++; \
      if (CpvAccess(networkProgressCount) >= networkProgressPeriod) { \
          CmiMachineProgressImpl(); \
          CommunicationServerPxshm(); \
          CpvAccess(networkProgressCount) = 0; \
      } \
  } while (0)
#else
#define CmiNetworkProgress() do { \
      CpvAccess(networkProgressCount)++; \
      if (CpvAccess(networkProgressCount) >= networkProgressPeriod) { \
          CmiMachineProgressImpl(); \
          CpvAccess(networkProgressCount) = 0; \
      } \
  } while (0)
#endif

#if CMK_USE_PXSHM
#define CmiNetworkProgressAfter(p) do { \
      CpvAccess(networkProgressCount)++; \
      if (CpvAccess(networkProgressCount) >= (p)) { \
          CmiMachineProgressImpl(); \
          CommunicationServerPxshm(); \
          CpvAccess(networkProgressCount) = 0; \
      } \
  } while (0)
#else
#define CmiNetworkProgressAfter(p) do { \
      CpvAccess(networkProgressCount)++; \
      if (CpvAccess(networkProgressCount) >= (p)) { \
          CmiMachineProgressImpl(); \
          CpvAccess(networkProgressCount) = 0; \
      } \
  } while (0)
#endif

#endif

/* Polling for immediate messages is just a progress-engine invocation. */
#define CmiProbeImmediateMsg CmiMachineProgressImpl

/*
   to immediate-fy a Converse message, set the most significant bit to 1
   in the Converse handler (x|0x8000). 
*/
#if CMK_IMMEDIATE_MSG
void CmiDelayImmediate(void);
#  define CmiBecomeImmediate(msg) do { \
	CmiSetHandler(msg, (CmiGetHandler(msg))|0x8000); \
     } while (0)
#  define CmiResetImmediate(msg) do { \
	CmiSetHandler(msg, (CmiGetHandler(msg))&(~0x8000)); \
     } while (0)
#  define CmiIsImmediate(msg)      ((CmiGetHandler(msg)) & 0x8000) 
#  define CmiImmediateHandler(msg) ((CmiGetHandler(msg)) ^ 0x8000)
/* Alternative (disabled) scheme: redirect through the immediate-message
   handler index instead of tagging the handler value:
#  define CmiIsImmediate(msg)   ((CmiGetHandler(msg) == CpvAccessOther(CmiImmediateMsgHandlerIdx,0)))
#  define CmiBecomeImmediate(msg) do {\
	CmiSetXHandler(msg,CmiGetHandler(msg)); \
	CmiSetHandler(msg,CpvAccessOther(CmiImmediateMsgHandlerIdx,0)); \
     } while (0)
#  define CmiImmediateHandler(msg) (CmiGetXHandler(msg))
*/
/* 
  for the non-SMP, non-interrupt-based version, this returns _immRunning;
  for SMP this does not matter - the CkMyPe() comparison normally fails and
           the non-thread-safe CqsEnqueueGeneral is avoided.
*/
#if CMK_NET_VERSION && ! CMK_SMP && ! defined(CMK_CPV_IS_SMP)
extern int _immRunning;
#  define CmiImmIsRunning()        (_immRunning)
#else
#  define CmiImmIsRunning()        (0)
#endif

#else
/* Immediate messages disabled: all of these become no-ops / false. */
#  define CmiBecomeImmediate(msg) /* empty */
#  define CmiResetImmediate(msg)  /* empty */
#  define CmiIsImmediate(msg)   (0)
#  define CmiImmIsRunning()       (0)
#endif

/******** Memory Fence ********/

/* ImplSelect<num> selects one of two implementations for the atomic operations depending on the number of parameters
 * e.g.) CmiMemoryAtomicIncrement(input)
 *       -> ImplSelect2(input, CmiMemoryAtomicIncrementMemOrder, CmiMemoryAtomicIncrementSimple) CmiMemoryAtomicIncrementSimple
 *       -> __sync_fetch_and_add(&input, 1)
 *
 *       CmiMemoryAtomicIncrement(input, memory_order_relaxed) (you can specify the memory consistency for each atomic operation with the C11 consistency keywords)
 *       -> ImplSelect2(input, memory_order_relaxed, CmiMemoryAtomicIncrementMemOrder, CmiMemoryAtomicIncrementSimple) CmiMemoryAtomicIncrementMemOrder
 *       -> atomic_fetch_add_explicit(&input, 1, memory_order_relaxed) (if the underlying compiler supports C11)
 *       -> CmiMemoryAtomicIncrementSimple(input) -> __sync_fetch_and_add(&input, 1) (if the compiler doesn't support C11, the memory consistency keyword ignored)
 *                                       -> __asm__ __volatile__("lock incl (%0)" :: "r" (&(someInt))) (CMK_GCC_X86_ASM on)
 *                                       -> CmiLock(cmiMemoryLock); someInt=someInt+1; CmiUnlock(cmiMemoryLock); ( Sync primitives and GCC asm not supported)
 * */
#define ImplSelect2(_1, _2, NAME, ...) NAME
#define ImplSelect3(_1, _2, _3, NAME, ...) NAME
#define CmiMemoryAtomicIncrement(...) ImplSelect2(__VA_ARGS__, CmiMemoryAtomicIncrementMemOrder, CmiMemoryAtomicIncrementSimple, )(__VA_ARGS__)
#define CmiMemoryAtomicDecrement(...) ImplSelect2(__VA_ARGS__, CmiMemoryAtomicDecrementMemOrder, CmiMemoryAtomicDecrementSimple, )(__VA_ARGS__)
#define CmiMemoryAtomicFetchAndInc(...) ImplSelect3(__VA_ARGS__, CmiMemoryAtomicFetchAndIncMemOrder, CmiMemoryAtomicFetchAndIncSimple, )(__VA_ARGS__)

#if CMK_SMP

#ifdef __cplusplus /* SYNC_LANG */

/* Temporarily close the enclosing extern "C" block: the template
   machinery below requires C++ linkage. */
}
// old versions of the <atomic> header have defects
// we need CmiRecoverType because value_type is absent
// error: ‘value_type’ in ‘struct std::atomic<unsigned int>’ does not name a type
// we need value_type to explicitly cast the literal 1
// error: no matching function for call to ‘atomic_fetch_add_explicit(std::atomic<unsigned int>*, int, std::memory_order)’
template <typename T> struct CmiRecoverType { };
template <> struct CmiRecoverType<std::atomic<int>> { using type = int; };
template <> struct CmiRecoverType<std::atomic<unsigned int>> { using type = unsigned int; };
extern "C" {

/* C++ implementation: std::atomic with explicit or default memory order. */
#define CmiMemoryAtomicType(type) std::atomic<type>
#define CmiMemoryAtomicIncrementMemOrder(someInt, MemModel) std::atomic_fetch_add_explicit(&(someInt), typename CmiRecoverType<decltype(someInt)>::type(1), (std::MemModel))
#define CmiMemoryAtomicDecrementMemOrder(someInt, MemModel) std::atomic_fetch_sub_explicit(&(someInt), typename CmiRecoverType<decltype(someInt)>::type(1), (std::MemModel))
#define CmiMemoryAtomicFetchAndIncMemOrder(input, output, MemModel) ((output) = std::atomic_fetch_add_explicit(&(input), typename CmiRecoverType<decltype(input)>::type(1), (std::MemModel)))
#define CmiMemoryAtomicIncrementSimple(someInt) std::atomic_fetch_add(&(someInt), typename CmiRecoverType<decltype(someInt)>::type(1))
#define CmiMemoryAtomicDecrementSimple(someInt) std::atomic_fetch_sub(&(someInt), typename CmiRecoverType<decltype(someInt)>::type(1))
#define CmiMemoryAtomicFetchAndIncSimple(input, output)((output) = std::atomic_fetch_add(&(input), typename CmiRecoverType<decltype(input)>::type(1)))

#elif defined CMK_HAS_C11_STDATOMIC /* SYNC_LANG */

/* C11 implementation: <stdatomic.h> generic functions. */
#define CmiMemoryAtomicType(type) type
#define CmiMemoryAtomicIncrementMemOrder(someInt, MemModel) atomic_fetch_add_explicit(&(someInt), 1, (MemModel))
#define CmiMemoryAtomicDecrementMemOrder(someInt, MemModel) atomic_fetch_sub_explicit(&(someInt), 1, (MemModel))
#define CmiMemoryAtomicFetchAndIncMemOrder(input, output, MemModel) ((output) = atomic_fetch_add_explicit(&(input), 1, (MemModel)))
#define CmiMemoryAtomicIncrementSimple(someInt) atomic_fetch_add(&(someInt), 1)
#define CmiMemoryAtomicDecrementSimple(someInt) atomic_fetch_sub(&(someInt), 1)
#define CmiMemoryAtomicFetchAndIncSimple(input, output) ((output) = atomic_fetch_add(&(input), 1))

#else /* SYNC_LANG */

#define CmiMemoryAtomicType(type) type
/* Mem ordering is not supported */
#define CmiMemoryAtomicIncrementMemOrder(someInt, MemModel) CmiMemoryAtomicIncrementSimple(someInt)
#define CmiMemoryAtomicDecrementMemOrder(someInt, MemModel) CmiMemoryAtomicDecrementSimple(someInt)
#define CmiMemoryAtomicFetchAndIncMemOrder(input, output, MemModel) CmiMemoryAtomicFetchAndIncSimple((input), (output))

#if CMK_C_SYNC_ADD_AND_FETCH_PRIMITIVE /* SYNC_PRIM */

/* GCC __sync builtins (full barrier semantics). */
#define CmiMemoryAtomicIncrementSimple(someInt) __sync_fetch_and_add(&(someInt), 1)
#define CmiMemoryAtomicDecrementSimple(someInt) __sync_fetch_and_sub(&(someInt), 1)
#define CmiMemoryAtomicFetchAndIncSimple(input, output) ((output) = __sync_fetch_and_add(&(input), 1))

#elif CMK_GCC_X86_ASM /* SYNC_PRIM */

#if 1
#define CmiMemoryAtomicIncrementSimple(someInt)  __asm__ __volatile__("lock incl (%0)" :: "r" (&(someInt)))
#define CmiMemoryAtomicDecrementSimple(someInt)  __asm__ __volatile__("lock decl (%0)" :: "r" (&(someInt)))
#else /* 1 */
/* this might be slightly faster, but does not compile with -O3 on netlrts-darwin-x86_64 */
#define CmiMemoryAtomicIncrementSimple(someInt)  __asm__ __volatile__("lock incl %0" :: "m" (someInt))
#define CmiMemoryAtomicDecrementSimple(someInt)  __asm__ __volatile__("lock decl %0" :: "m" (someInt))
#endif /* 1 */
#define CmiMemoryAtomicFetchAndIncSimple(input,output) __asm__ __volatile__( \
        "movl $1, %1\n\t" \
        "lock xaddl %1, %0" \
        : "=m"(input), "=r"(output) : "m"(input) : "memory")

#else /* SYNC_PRIM */

/* Last resort: serialize through a node-level lock. */
extern CmiNodeLock cmiMemoryLock;
#define CmiMemoryAtomicIncrementSimple(someInt) { CmiLock(cmiMemoryLock); ((someInt)++); CmiUnlock(cmiMemoryLock); }
#define CmiMemoryAtomicDecrementSimple(someInt) { CmiLock(cmiMemoryLock); ((someInt)--); CmiUnlock(cmiMemoryLock); }
#define CmiMemoryAtomicFetchAndIncSimple(input,output) { CmiLock(cmiMemoryLock); ((output) = (input)++); CmiUnlock(cmiMemoryLock); }

#endif /* SYNC_PRIM */

#endif /* SYNC_LANG */

/* Full memory fences: best available primitive for this toolchain. */
#if CMK_C_SYNC_SYNCHRONIZE_PRIMITIVE
#define CmiMemoryReadFence()                 __sync_synchronize()
#define CmiMemoryWriteFence()                __sync_synchronize()
#elif defined _MSC_VER
#define CmiMemoryReadFence()                 MemoryBarrier()
#define CmiMemoryWriteFence()                MemoryBarrier()
#elif defined __cplusplus
#define CmiMemoryReadFence()                 std::atomic_thread_fence(std::memory_order_seq_cst)
#define CmiMemoryWriteFence()                std::atomic_thread_fence(std::memory_order_seq_cst)
#elif defined CMK_HAS_C11_STDATOMIC
#define CmiMemoryReadFence()                 atomic_thread_fence(memory_order_seq_cst)
#define CmiMemoryWriteFence()                atomic_thread_fence(memory_order_seq_cst)
#else
/* Fallback: a lock/unlock pair acts as a full barrier. */
extern CmiNodeLock cmiMemoryLock;
#define CmiMemoryReadFence()               { CmiLock(cmiMemoryLock); CmiUnlock(cmiMemoryLock); }
#define CmiMemoryWriteFence()              { CmiLock(cmiMemoryLock); CmiUnlock(cmiMemoryLock); }
#endif /* CMK_C_SYNC_SYNCHRONIZE_PRIMITIVE */

#else  /* for non-SMP, no need to define */

/* Single-threaded build: plain operations, fences compile away. */
#define CmiMemoryAtomicType(type) type
#define CmiMemoryAtomicIncrementSimple(someInt) ((someInt)++)
#define CmiMemoryAtomicDecrementSimple(someInt) ((someInt)--)
#define CmiMemoryAtomicFetchAndIncSimple(input, output) ((output) = (input)++)
#define CmiMemoryAtomicIncrementMemOrder(someInt, MemModel) CmiMemoryAtomicIncrementSimple(someInt)
#define CmiMemoryAtomicDecrementMemOrder(someInt, MemModel) CmiMemoryAtomicDecrementSimple(someInt)
#define CmiMemoryAtomicFetchAndIncMemOrder(input, output, MemModel) CmiMemoryAtomicFetchAndIncSimple((input), (output))
#define CmiMemoryReadFence()
#define CmiMemoryWriteFence()

#endif /*if CMK_SMP*/

/* Concrete atomic integer types built from the macros above. */
#ifdef __cplusplus
using CmiMemoryAtomicInt = CmiMemoryAtomicType(int);
using CmiMemoryAtomicUInt = CmiMemoryAtomicType(unsigned int);
#else
typedef CmiMemoryAtomicType(int) CmiMemoryAtomicInt;
typedef CmiMemoryAtomicType(unsigned int) CmiMemoryAtomicUInt;
#endif

/******** Performance Counters ********/
/* Hardware performance-counter access; values are written into the
   caller-supplied arrays (numEvents entries each). */
void CmiInitCounters(void);
void CmiStartCounters(int events[], int numEvents);
void CmiStopCounters(int events[], CMK_TYPEDEF_INT8 values[], int numEvents);

/******** Trace ********/

/* this is the type for thread ID, mainly used for projection. */
#define OBJ_ID_SZ 4
typedef struct _CmiObjId {
  int id[OBJ_ID_SZ];
  /* 
   * **CWL** Note: setting initial values to -1 does not seem to be done for 
   *               LDObjid. Potential consistency problems could arise. This
   *               will probably have to be dealt with later.
   */
#ifdef __cplusplus
  /* Default-construct as "null": every component holds the sentinel -1. */
  _CmiObjId() { 
    for (int i=0; i<OBJ_ID_SZ; i++) {
      id[i] = -1;
    }
  }
  /* True when every component still holds the sentinel -1.
     const so it can be called through const references. */
  bool isNull() const {
    for (int i=0; i<OBJ_ID_SZ; i++) {
      if (id[i] != -1) return false;
    }
    return true;
  }
  /* Component-wise equality. */
  bool operator==(const struct _CmiObjId& objid) const {
    for (int i=0; i<OBJ_ID_SZ; i++) if (id[i] != objid.id[i]) return false;
    return true;
  }
#endif
} CmiObjId;

/* public interface for thread id acquisition */
CmiObjId *CthGetThreadID(CthThread th);
void CthSetThreadID(CthThread th, int a, int b, int c);

/* Tracing hook invoked when thread t resumes execution. */
void CthTraceResume(CthThread t);

/* Fault evacuation: per-PE liveness bitmap. */
#if CMK_FAULT_EVAC
CpvExtern(char *,_validProcessors);
#define CmiNodeAlive(x)  (CpvAccess(_validProcessors)[x])
#endif

/* Report this machine's byte order (see implementation for encoding). */
int CmiEndianness(void);

/* Memory-debugging hooks: tag an allocation as a chare or a message.
   Compiled out entirely when CMK_CHARMDEBUG is off. */
#if CMK_CHARMDEBUG
extern void setMemoryTypeChare(void*); /* for memory debugging */
extern void setMemoryTypeMessage(void*); /* for memory debugging */
#else
#define setMemoryTypeChare(p) /* empty memory debugging method */
#define setMemoryTypeMessage(p) /* empty memory debugging method */
#endif

#include "conv-cpm.h"
#include "conv-cpath.h"
#include "conv-qd.h"
#include "conv-random.h"
#include "conv-lists.h"
#include "conv-trace.h"
#include "persistent.h"

#include "cmirdmautils.h"

/* Memory-adaptive scheduling: list of memory-critical entry methods. */
#ifdef ADAPT_SCHED_MEM
extern int numMemCriticalEntries;
extern int *memCriticalEntries;
#endif

/* Parse a human-readable size string (e.g. with a unit suffix) into bytes. */
double CmiReadSize(const char *str);

/* Runtime statistics toggles; no-ops except on the uGNI machine layer. */
#if  CMK_CONVERSE_UGNI
void CmiTurnOnStats(void);
void CmiTurnOffStats(void);
#else
#define CmiTurnOnStats()
#define CmiTurnOffStats()
#endif

/* CharmLibInterOperate should be a global variable as it will be
 * set only once by MPI ranks respectively.
 */
extern int CharmLibInterOperate;
CpvExtern(int,charmLibExitFlag);

/******** I/O wrappers ***********/
/* Thin wrappers around the C/POSIX I/O calls of the same names;
   see the implementation for retry/error-handling behavior. */

size_t CmiFwrite(const void *ptr, size_t size, size_t nmemb, FILE *f);
CmiInt8 CmiPwrite(int fd, const char *buf, size_t bytes, size_t offset);
int CmiOpen(const char *pathname, int flags, int mode);
FILE *CmiFopen(const char *path, const char *mode);
int CmiFclose(FILE *fp);

/* Base-2 logarithm: use libm's log2 when available, otherwise the
   runtime provides integer and floating-point fallbacks. */
#if CMK_HAS_LOG2
#define CmiLog2   log2
#define CmiILog2  log2
#else
extern unsigned int CmiILog2(unsigned int);
extern double CmiLog2(double);
#endif

#if defined(__cplusplus)
}                                         /* end of extern "C"  */
#endif

/* Grid-queue support: registration and lookup of grid-object queues,
   keyed by group id plus up to three integer indices. */
#if CMK_GRID_QUEUE_AVAILABLE
#if defined(__cplusplus)
extern "C" {
#endif
extern int CmiGetCluster (int pe);
extern int CmiGridQueueGetInterval (void);
extern int CmiGridQueueGetThreshold (void);
extern void CmiGridQueueRegister (int gid, int nInts, int index1, int index2, int index3);
extern void CmiGridQueueDeregister (int gid, int nInts, int index1, int index2, int index3);
extern void CmiGridQueueDeregisterAll (void);
extern int CmiGridQueueLookup (int gid, int nInts, int index1, int index2, int index3);
extern int CmiGridQueueLookupMsg (char *msg);
#if defined(__cplusplus)
}
#endif
#endif

#include "debug-conv.h"

/* Header prepended to each fragment of a fragmented message. */
typedef struct {
  CmiUInt4 msgSize;     /* total message size in bytes */
  CmiUInt2 senderPe;    /* PE that sent the fragment */
  CmiUInt2 destination; /* destination identifier */
} CmiFragmentHeader;

/* Offloading work to the communication thread (SMP builds only):
   a CmiNotifyCommThdMsg carries a function pointer plus its packed
   arguments to be executed by the comm thread. */
#if CMK_SMP && CMK_LEVERAGE_COMMTHREAD
#if defined(__cplusplus)
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif
typedef void (*CmiCommThdFnPtr)(int numParams, void *params);
typedef struct CmiNotifyCommThdMsg {
    char core[CmiMsgHeaderSizeBytes]; /* Converse message header */
    CmiCommThdFnPtr fn;               /* function the comm thread runs */
    int numParams;                    /* number of entries in params */
    void *params;                     /* packed arguments for fn */
    int toKeep; /* whether to free this msg by comm thread when the msg is processed */ 
}CmiNotifyCommThdMsg;

EXTERN CmiNotifyCommThdMsg *CmiCreateNotifyCommThdMsg(CmiCommThdFnPtr fn, int numParams, void *params, int toKeep);
EXTERN void CmiFreeNotifyCommThdMsg(CmiNotifyCommThdMsg *msg);
/* Initialize a notification msg */
EXTERN void CmiResetNotifyCommThdMsg(CmiNotifyCommThdMsg *msg, CmiCommThdFnPtr fn, int numParams, void *params, int toKeep);
/* Enqueue the msg into the local comm thread, and wait for being processed */
EXTERN void CmiNotifyCommThd(CmiNotifyCommThdMsg *msg);
#endif

/* Out-of-band (urgent) sends: the flag is consulted by the send path
   when CMK_USE_OOB is enabled; otherwise the toggle is a no-op. */
CpvExtern(int, _urgentSend);
#if CMK_USE_OOB
#define CmiEnableUrgentSend(yn)   CpvAccess(_urgentSend)=(yn)
#else
#define CmiEnableUrgentSend(yn)   
#endif

/* Within-node task queue (SMP builds with CMK_TASKQUEUE). */
#if CMK_SMP && CMK_TASKQUEUE
#include "taskqueue.h" /* for tasks queue */
#include "conv-taskQ.h" /* for standalone-OpenMP */
/* Push/pop on this PE's task queue. */
#define CsdTaskEnqueue(x) TaskQueuePush((TaskQueue)CpvAccess(CsdTaskQueue),x)
#define CsdTaskPop() TaskQueuePop((TaskQueue)CpvAccess(CsdTaskQueue))
#if CMK_OMP
#if defined(__cplusplus)
extern "C"
#endif
int CmiGetCurKnownOmpThreads(void);
#endif
#endif
/* Whether this PE helps execute tasks from other threads' queues. */
CpvCExtern(int, isHelperOn);
#if defined(__cplusplus)
extern "C"
#endif
void CmiSetPeHelpsOtherThreads(int);
#endif /* CONVERSE_H */


